• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (c) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 //  rbbitblb.cpp
11 //
12 
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbiscan.h"
22 #include "rbbisetb.h"
23 #include "rbbidata.h"
24 #include "cstring.h"
25 #include "uassert.h"
26 #include "uvectr32.h"
27 #include "cmemory.h"
28 
29 U_NAMESPACE_BEGIN
30 
31 const int32_t kMaxStateFor8BitsTable = 255;
32 
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode,UErrorCode & status)33 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
34         fRB(rb),
35         fTree(*rootNode),
36         fStatus(&status),
37         fDStates(nullptr),
38         fSafeTable(nullptr) {
39     if (U_FAILURE(status)) {
40         return;
41     }
42     // fDStates is UVector<RBBIStateDescriptor *>
43     fDStates = new UVector(status);
44     if (U_SUCCESS(status) && fDStates == nullptr ) {
45         status = U_MEMORY_ALLOCATION_ERROR;
46     }
47 }
48 
49 
50 
~RBBITableBuilder()51 RBBITableBuilder::~RBBITableBuilder() {
52     int i;
53     for (i=0; i<fDStates->size(); i++) {
54         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
55     }
56     delete fDStates;
57     delete fSafeTable;
58     delete fLookAheadRuleMap;
59 }
60 
61 
62 //-----------------------------------------------------------------------------
63 //
64 //   RBBITableBuilder::buildForwardTable  -  This is the main function for building
65 //                               the DFA state transition table from the RBBI rules parse tree.
66 //
67 //-----------------------------------------------------------------------------
buildForwardTable()68 void  RBBITableBuilder::buildForwardTable() {
69 
70     if (U_FAILURE(*fStatus)) {
71         return;
72     }
73 
74     // If there were no rules, just return.  This situation can easily arise
75     //   for the reverse rules.
76     if (fTree==nullptr) {
77         return;
78     }
79 
80     //
81     // Walk through the tree, replacing any references to $variables with a copy of the
82     //   parse tree for the substitution expression.
83     //
84     fTree = fTree->flattenVariables(*fStatus, 0);
85     if (U_FAILURE(*fStatus)) {
86         return;
87     }
88 #ifdef RBBI_DEBUG
89     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
90         RBBIDebugPuts("\nParse tree after flattening variable references.");
91         RBBINode::printTree(fTree, true);
92     }
93 #endif
94 
95     //
96     // If the rules contained any references to {bof}
97     //   add a {bof} <cat> <former root of tree> to the
98     //   tree.  Means that all matches must start out with the
99     //   {bof} fake character.
100     //
101     if (fRB->fSetBuilder->sawBOF()) {
102         RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
103         RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
104         // Delete and exit if memory allocation failed.
105         if (bofTop == nullptr || bofLeaf == nullptr) {
106             *fStatus = U_MEMORY_ALLOCATION_ERROR;
107             delete bofTop;
108             delete bofLeaf;
109             return;
110         }
111         bofTop->fLeftChild  = bofLeaf;
112         bofTop->fRightChild = fTree;
113         bofLeaf->fParent    = bofTop;
114         bofLeaf->fVal       = 2;      // Reserved value for {bof}.
115         fTree               = bofTop;
116     }
117 
118     //
119     // Add a unique right-end marker to the expression.
120     //   Appears as a cat-node, left child being the original tree,
121     //   right child being the end marker.
122     //
123     RBBINode *cn = new RBBINode(RBBINode::opCat);
124     // Exit if memory allocation failed.
125     if (cn == nullptr) {
126         *fStatus = U_MEMORY_ALLOCATION_ERROR;
127         return;
128     }
129     cn->fLeftChild = fTree;
130     fTree->fParent = cn;
131     RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
132     // Delete and exit if memory allocation failed.
133     if (cn->fRightChild == nullptr) {
134         *fStatus = U_MEMORY_ALLOCATION_ERROR;
135         delete cn;
136         return;
137     }
138     cn->fRightChild->fParent = cn;
139     fTree = cn;
140 
141     //
142     //  Replace all references to UnicodeSets with the tree for the equivalent
143     //      expression.
144     //
145     fTree->flattenSets();
146 #ifdef RBBI_DEBUG
147     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
148         RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
149         RBBINode::printTree(fTree, true);
150     }
151 #endif
152 
153 
154     //
155     // calculate the functions nullable, firstpos, lastpos and followpos on
156     // nodes in the parse tree.
157     //    See the algorithm description in Aho.
158     //    Understanding how this works by looking at the code alone will be
159     //       nearly impossible.
160     //
161     calcNullable(fTree);
162     calcFirstPos(fTree);
163     calcLastPos(fTree);
164     calcFollowPos(fTree);
165     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
166         RBBIDebugPuts("\n");
167         printPosSets(fTree);
168     }
169 
170     //
171     //  For "chained" rules, modify the followPos sets
172     //
173     if (fRB->fChainRules) {
174         calcChainedFollowPos(fTree, endMarkerNode);
175     }
176 
177     //
178     //  BOF (start of input) test fixup.
179     //
180     if (fRB->fSetBuilder->sawBOF()) {
181         bofFixup();
182     }
183 
184     //
185     // Build the DFA state transition tables.
186     //
187     buildStateTable();
188     mapLookAheadRules();
189     flagAcceptingStates();
190     flagLookAheadStates();
191     flagTaggedStates();
192 
193     //
194     // Update the global table of rule status {tag} values
195     // The rule builder has a global vector of status values that are common
196     //    for all tables.  Merge the ones from this table into the global set.
197     //
198     mergeRuleStatusVals();
199 }
200 
201 
202 
203 //-----------------------------------------------------------------------------
204 //
205 //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
206 //
207 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)208 void RBBITableBuilder::calcNullable(RBBINode *n) {
209     if (n == nullptr) {
210         return;
211     }
212     if (n->fType == RBBINode::setRef ||
213         n->fType == RBBINode::endMark ) {
214         // These are non-empty leaf node types.
215         n->fNullable = false;
216         return;
217     }
218 
219     if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
220         // Lookahead marker node.  It's a leaf, so no recursion on children.
221         // It's nullable because it does not match any literal text from the input stream.
222         n->fNullable = true;
223         return;
224     }
225 
226 
227     // The node is not a leaf.
228     //  Calculate nullable on its children.
229     calcNullable(n->fLeftChild);
230     calcNullable(n->fRightChild);
231 
232     // Apply functions from table 3.40 in Aho
233     if (n->fType == RBBINode::opOr) {
234         n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
235     }
236     else if (n->fType == RBBINode::opCat) {
237         n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
238     }
239     else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
240         n->fNullable = true;
241     }
242     else {
243         n->fNullable = false;
244     }
245 }
246 
247 
248 
249 
250 //-----------------------------------------------------------------------------
251 //
252 //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
253 //
254 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)255 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
256     if (n == nullptr) {
257         return;
258     }
259     if (n->fType == RBBINode::leafChar  ||
260         n->fType == RBBINode::endMark   ||
261         n->fType == RBBINode::lookAhead ||
262         n->fType == RBBINode::tag) {
263         // These are non-empty leaf node types.
264         // Note: In order to maintain the sort invariant on the set,
265         // this function should only be called on a node whose set is
266         // empty to start with.
267         n->fFirstPosSet->addElement(n, *fStatus);
268         return;
269     }
270 
271     // The node is not a leaf.
272     //  Calculate firstPos on its children.
273     calcFirstPos(n->fLeftChild);
274     calcFirstPos(n->fRightChild);
275 
276     // Apply functions from table 3.40 in Aho
277     if (n->fType == RBBINode::opOr) {
278         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
279         setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
280     }
281     else if (n->fType == RBBINode::opCat) {
282         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
283         if (n->fLeftChild->fNullable) {
284             setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
285         }
286     }
287     else if (n->fType == RBBINode::opStar ||
288              n->fType == RBBINode::opQuestion ||
289              n->fType == RBBINode::opPlus) {
290         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
291     }
292 }
293 
294 
295 
296 //-----------------------------------------------------------------------------
297 //
298 //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
299 //
300 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)301 void RBBITableBuilder::calcLastPos(RBBINode *n) {
302     if (n == nullptr) {
303         return;
304     }
305     if (n->fType == RBBINode::leafChar  ||
306         n->fType == RBBINode::endMark   ||
307         n->fType == RBBINode::lookAhead ||
308         n->fType == RBBINode::tag) {
309         // These are non-empty leaf node types.
310         // Note: In order to maintain the sort invariant on the set,
311         // this function should only be called on a node whose set is
312         // empty to start with.
313         n->fLastPosSet->addElement(n, *fStatus);
314         return;
315     }
316 
317     // The node is not a leaf.
318     //  Calculate lastPos on its children.
319     calcLastPos(n->fLeftChild);
320     calcLastPos(n->fRightChild);
321 
322     // Apply functions from table 3.40 in Aho
323     if (n->fType == RBBINode::opOr) {
324         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
325         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
326     }
327     else if (n->fType == RBBINode::opCat) {
328         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
329         if (n->fRightChild->fNullable) {
330             setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
331         }
332     }
333     else if (n->fType == RBBINode::opStar     ||
334              n->fType == RBBINode::opQuestion ||
335              n->fType == RBBINode::opPlus) {
336         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
337     }
338 }
339 
340 
341 
342 //-----------------------------------------------------------------------------
343 //
344 //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
345 //
346 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)347 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
348     if (n == nullptr ||
349         n->fType == RBBINode::leafChar ||
350         n->fType == RBBINode::endMark) {
351         return;
352     }
353 
354     calcFollowPos(n->fLeftChild);
355     calcFollowPos(n->fRightChild);
356 
357     // Aho rule #1
358     if (n->fType == RBBINode::opCat) {
359         RBBINode *i;   // is 'i' in Aho's description
360         uint32_t     ix;
361 
362         UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
363 
364         for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
365             i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
366             setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
367         }
368     }
369 
370     // Aho rule #2
371     if (n->fType == RBBINode::opStar ||
372         n->fType == RBBINode::opPlus) {
373         RBBINode   *i;  // again, n and i are the names from Aho's description.
374         uint32_t    ix;
375 
376         for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
377             i = (RBBINode *)n->fLastPosSet->elementAt(ix);
378             setAdd(i->fFollowPos, n->fFirstPosSet);
379         }
380     }
381 
382 
383 
384 }
385 
386 //-----------------------------------------------------------------------------
387 //
388 //    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
389 //                        as roots of a rule to a destination vector.
390 //
391 //-----------------------------------------------------------------------------
addRuleRootNodes(UVector * dest,RBBINode * node)392 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
393     if (node == nullptr || U_FAILURE(*fStatus)) {
394         return;
395     }
396     U_ASSERT(!dest->hasDeleter());
397     if (node->fRuleRoot) {
398         dest->addElement(node, *fStatus);
399         // Note: rules cannot nest. If we found a rule start node,
400         //       no child node can also be a start node.
401         return;
402     }
403     addRuleRootNodes(dest, node->fLeftChild);
404     addRuleRootNodes(dest, node->fRightChild);
405 }
406 
407 //-----------------------------------------------------------------------------
408 //
409 //   calcChainedFollowPos.    Modify the previously calculated followPos sets
410 //                            to implement rule chaining.  NOT described by Aho
411 //
412 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree,RBBINode * endMarkNode)413 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
414 
415     UVector         leafNodes(*fStatus);
416     if (U_FAILURE(*fStatus)) {
417         return;
418     }
419 
420     // get a list all leaf nodes
421     tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
422     if (U_FAILURE(*fStatus)) {
423         return;
424     }
425 
426     // Collect all leaf nodes that can start matches for rules
427     // with inbound chaining enabled, which is the union of the
428     // firstPosition sets from each of the rule root nodes.
429 
430     UVector ruleRootNodes(*fStatus);
431     addRuleRootNodes(&ruleRootNodes, tree);
432 
433     UVector matchStartNodes(*fStatus);
434     for (int j=0; j<ruleRootNodes.size(); ++j) {
435         RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
436         if (node->fChainIn) {
437             setAdd(&matchStartNodes, node->fFirstPosSet);
438         }
439     }
440     if (U_FAILURE(*fStatus)) {
441         return;
442     }
443 
444     int32_t  endNodeIx;
445     int32_t  startNodeIx;
446 
447     for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
448         RBBINode *endNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
449 
450         // Identify leaf nodes that correspond to overall rule match positions.
451         // These include the endMarkNode in their followPos sets.
452         //
453         // Note: do not consider other end marker nodes, those that are added to
454         //       look-ahead rules. These can't chain; a match immediately stops
455         //       further matching. This leaves exactly one end marker node, the one
456         //       at the end of the complete tree.
457 
458         if (!endNode->fFollowPos->contains(endMarkNode)) {
459             continue;
460         }
461 
462         // We've got a node that can end a match.
463 
464         // Now iterate over the nodes that can start a match, looking for ones
465         //   with the same char class as our ending node.
466         RBBINode *startNode;
467         for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
468             startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
469             if (startNode->fType != RBBINode::leafChar) {
470                 continue;
471             }
472 
473             if (endNode->fVal == startNode->fVal) {
474                 // The end val (character class) of one possible match is the
475                 //   same as the start of another.
476 
477                 // Add all nodes from the followPos of the start node to the
478                 //  followPos set of the end node, which will have the effect of
479                 //  letting matches transition from a match state at endNode
480                 //  to the second char of a match starting with startNode.
481                 setAdd(endNode->fFollowPos, startNode->fFollowPos);
482             }
483         }
484     }
485 }
486 
487 
488 //-----------------------------------------------------------------------------
489 //
490 //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
491 //                Do an swizzle similar to chaining, modifying the followPos set of
492 //                the bofNode to include the followPos nodes from other {bot} nodes
493 //                scattered through the tree.
494 //
495 //                This function has much in common with calcChainedFollowPos().
496 //
497 //-----------------------------------------------------------------------------
bofFixup()498 void RBBITableBuilder::bofFixup() {
499 
500     if (U_FAILURE(*fStatus)) {
501         return;
502     }
503 
504     //   The parse tree looks like this ...
505     //         fTree root  --->       <cat>
506     //                               /     \       .
507     //                            <cat>   <#end node>
508     //                           /     \  .
509     //                     <bofNode>   rest
510     //                               of tree
511     //
512     //    We will be adding things to the followPos set of the <bofNode>
513     //
514     RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
515     U_ASSERT(bofNode->fType == RBBINode::leafChar);
516     U_ASSERT(bofNode->fVal == 2);
517 
518     // Get all nodes that can be the start a match of the user-written rules
519     //  (excluding the fake bofNode)
520     //  We want the nodes that can start a match in the
521     //     part labeled "rest of tree"
522     //
523     UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
524 
525     RBBINode *startNode;
526     int       startNodeIx;
527     for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
528         startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
529         if (startNode->fType != RBBINode::leafChar) {
530             continue;
531         }
532 
533         if (startNode->fVal == bofNode->fVal) {
534             //  We found a leaf node corresponding to a {bof} that was
535             //    explicitly written into a rule.
536             //  Add everything from the followPos set of this node to the
537             //    followPos set of the fake bofNode at the start of the tree.
538             //
539             setAdd(bofNode->fFollowPos, startNode->fFollowPos);
540         }
541     }
542 }
543 
544 //-----------------------------------------------------------------------------
545 //
546 //   buildStateTable()    Determine the set of runtime DFA states and the
547 //                        transition tables for these states, by the algorithm
548 //                        of fig. 3.44 in Aho.
549 //
550 //                        Most of the comments are quotes of Aho's psuedo-code.
551 //
552 //-----------------------------------------------------------------------------
buildStateTable()553 void RBBITableBuilder::buildStateTable() {
554     if (U_FAILURE(*fStatus)) {
555         return;
556     }
557     RBBIStateDescriptor *failState;
558     // Set it to nullptr to avoid uninitialized warning
559     RBBIStateDescriptor *initialState = nullptr;
560     //
561     // Add a dummy state 0 - the stop state.  Not from Aho.
562     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
563     failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
564     if (failState == nullptr) {
565         *fStatus = U_MEMORY_ALLOCATION_ERROR;
566         goto ExitBuildSTdeleteall;
567     }
568     failState->fPositions = new UVector(*fStatus);
569     if (failState->fPositions == nullptr) {
570         *fStatus = U_MEMORY_ALLOCATION_ERROR;
571     }
572     if (failState->fPositions == nullptr || U_FAILURE(*fStatus)) {
573         goto ExitBuildSTdeleteall;
574     }
575     fDStates->addElement(failState, *fStatus);
576     if (U_FAILURE(*fStatus)) {
577         goto ExitBuildSTdeleteall;
578     }
579 
580     // initially, the only unmarked state in Dstates is firstpos(root),
581     //       where toot is the root of the syntax tree for (r)#;
582     initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
583     if (initialState == nullptr) {
584         *fStatus = U_MEMORY_ALLOCATION_ERROR;
585     }
586     if (U_FAILURE(*fStatus)) {
587         goto ExitBuildSTdeleteall;
588     }
589     initialState->fPositions = new UVector(*fStatus);
590     if (initialState->fPositions == nullptr) {
591         *fStatus = U_MEMORY_ALLOCATION_ERROR;
592     }
593     if (U_FAILURE(*fStatus)) {
594         goto ExitBuildSTdeleteall;
595     }
596     setAdd(initialState->fPositions, fTree->fFirstPosSet);
597     fDStates->addElement(initialState, *fStatus);
598     if (U_FAILURE(*fStatus)) {
599         goto ExitBuildSTdeleteall;
600     }
601 
602     // while there is an unmarked state T in Dstates do begin
603     for (;;) {
604         RBBIStateDescriptor *T = nullptr;
605         int32_t              tx;
606         for (tx=1; tx<fDStates->size(); tx++) {
607             RBBIStateDescriptor *temp;
608             temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
609             if (temp->fMarked == false) {
610                 T = temp;
611                 break;
612             }
613         }
614         if (T == nullptr) {
615             break;
616         }
617 
618         // mark T;
619         T->fMarked = true;
620 
621         // for each input symbol a do begin
622         int32_t  a;
623         for (a = 1; a<=lastInputSymbol; a++) {
624             // let U be the set of positions that are in followpos(p)
625             //    for some position p in T
626             //    such that the symbol at position p is a;
627             UVector    *U = nullptr;
628             RBBINode   *p;
629             int32_t     px;
630             for (px=0; px<T->fPositions->size(); px++) {
631                 p = (RBBINode *)T->fPositions->elementAt(px);
632                 if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
633                     if (U == nullptr) {
634                         U = new UVector(*fStatus);
635                         if (U == nullptr) {
636                         	*fStatus = U_MEMORY_ALLOCATION_ERROR;
637                         	goto ExitBuildSTdeleteall;
638                         }
639                     }
640                     setAdd(U, p->fFollowPos);
641                 }
642             }
643 
644             // if U is not empty and not in DStates then
645             int32_t  ux = 0;
646             UBool    UinDstates = false;
647             if (U != nullptr) {
648                 U_ASSERT(U->size() > 0);
649                 int  ix;
650                 for (ix=0; ix<fDStates->size(); ix++) {
651                     RBBIStateDescriptor *temp2;
652                     temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
653                     if (setEquals(U, temp2->fPositions)) {
654                         delete U;
655                         U  = temp2->fPositions;
656                         ux = ix;
657                         UinDstates = true;
658                         break;
659                     }
660                 }
661 
662                 // Add U as an unmarked state to Dstates
663                 if (!UinDstates)
664                 {
665                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
666                     if (newState == nullptr) {
667                     	*fStatus = U_MEMORY_ALLOCATION_ERROR;
668                     }
669                     if (U_FAILURE(*fStatus)) {
670                         goto ExitBuildSTdeleteall;
671                     }
672                     newState->fPositions = U;
673                     fDStates->addElement(newState, *fStatus);
674                     if (U_FAILURE(*fStatus)) {
675                         return;
676                     }
677                     ux = fDStates->size()-1;
678                 }
679 
680                 // Dtran[T, a] := U;
681                 T->fDtran->setElementAt(ux, a);
682             }
683         }
684     }
685     return;
686     // delete local pointers only if error occurred.
687 ExitBuildSTdeleteall:
688     delete initialState;
689     delete failState;
690 }
691 
692 
693 /**
694  * mapLookAheadRules
695  *
696  */
mapLookAheadRules()697 void RBBITableBuilder::mapLookAheadRules() {
698     fLookAheadRuleMap =  new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
699     if (fLookAheadRuleMap == nullptr) {
700         *fStatus = U_MEMORY_ALLOCATION_ERROR;
701     }
702     if (U_FAILURE(*fStatus)) {
703         return;
704     }
705     fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
706 
707     for (int32_t n=0; n<fDStates->size(); n++) {
708         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
709         int32_t laSlotForState = 0;
710 
711         // Establish the look-ahead slot for this state, if the state covers
712         // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
713 
714         // If any of the look-ahead nodes already have a slot assigned, use it,
715         // otherwise assign a new one.
716 
717         bool sawLookAheadNode = false;
718         for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
719             RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
720             if (node->fType != RBBINode::NodeType::lookAhead) {
721                 continue;
722             }
723             sawLookAheadNode = true;
724             int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
725             U_ASSERT(ruleNum < fLookAheadRuleMap->size());
726             U_ASSERT(ruleNum > 0);
727             int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
728             if (laSlot != 0) {
729                 if (laSlotForState == 0) {
730                     laSlotForState = laSlot;
731                 } else {
732                     // TODO: figure out if this can fail, change to setting an error code if so.
733                     U_ASSERT(laSlot == laSlotForState);
734                 }
735             }
736         }
737         if (!sawLookAheadNode) {
738             continue;
739         }
740 
741         if (laSlotForState == 0) {
742             laSlotForState = ++fLASlotsInUse;
743         }
744 
745         // For each look ahead node covered by this state,
746         // set the mapping from the node's rule number to the look ahead slot.
747         // There can be multiple nodes/rule numbers going to the same la slot.
748 
749         for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
750             RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
751             if (node->fType != RBBINode::NodeType::lookAhead) {
752                 continue;
753             }
754             int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
755             int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
756             (void)existingVal;
757             U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
758             fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
759         }
760     }
761 
762 }
763 
764 //-----------------------------------------------------------------------------
765 //
766 //   flagAcceptingStates    Identify accepting states.
767 //                          First get a list of all of the end marker nodes.
768 //                          Then, for each state s,
769 //                              if s contains one of the end marker nodes in its list of tree positions then
770 //                                  s is an accepting state.
771 //
772 //-----------------------------------------------------------------------------
flagAcceptingStates()773 void     RBBITableBuilder::flagAcceptingStates() {
774     if (U_FAILURE(*fStatus)) {
775         return;
776     }
777     UVector     endMarkerNodes(*fStatus);
778     RBBINode    *endMarker;
779     int32_t     i;
780     int32_t     n;
781 
782     if (U_FAILURE(*fStatus)) {
783         return;
784     }
785 
786     fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
787     if (U_FAILURE(*fStatus)) {
788         return;
789     }
790 
791     for (i=0; i<endMarkerNodes.size(); i++) {
792         endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
793         for (n=0; n<fDStates->size(); n++) {
794             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
795             if (sd->fPositions->indexOf(endMarker) >= 0) {
796                 // Any non-zero value for fAccepting means this is an accepting node.
797                 // The value is what will be returned to the user as the break status.
798                 // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
799 
800                 if (sd->fAccepting==0) {
801                     // State hasn't been marked as accepting yet.  Do it now.
802                     sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
803                     if (sd->fAccepting == 0) {
804                         sd->fAccepting = ACCEPTING_UNCONDITIONAL;
805                     }
806                 }
807                 if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) {
808                     // Both lookahead and non-lookahead accepting for this state.
809                     // Favor the look-ahead, because a look-ahead match needs to
810                     // immediately stop the run-time engine. First match, not longest.
811                     sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
812                 }
813                 // implicit else:
814                 // if sd->fAccepting already had a value other than 0 or 1, leave it be.
815             }
816         }
817     }
818 }
819 
820 
821 //-----------------------------------------------------------------------------
822 //
823 //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
824 //
825 //-----------------------------------------------------------------------------
flagLookAheadStates()826 void     RBBITableBuilder::flagLookAheadStates() {
827     if (U_FAILURE(*fStatus)) {
828         return;
829     }
830     UVector     lookAheadNodes(*fStatus);
831     RBBINode    *lookAheadNode;
832     int32_t     i;
833     int32_t     n;
834 
835     fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
836     if (U_FAILURE(*fStatus)) {
837         return;
838     }
839     for (i=0; i<lookAheadNodes.size(); i++) {
840         lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
841         U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
842 
843         for (n=0; n<fDStates->size(); n++) {
844             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
845             int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
846             if (positionsIdx >= 0) {
847                 U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
848                 uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
849                 U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
850                 // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
851                 //     printf("%s:%d Bingo. sd->fLookAhead:%d   lookaheadSlot:%d\n",
852                 //            __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
853                 // }
854                 sd->fLookAhead = lookaheadSlot;
855             }
856         }
857     }
858 }
859 
860 
861 
862 
863 //-----------------------------------------------------------------------------
864 //
865 //    flagTaggedStates
866 //
867 //-----------------------------------------------------------------------------
flagTaggedStates()868 void     RBBITableBuilder::flagTaggedStates() {
869     if (U_FAILURE(*fStatus)) {
870         return;
871     }
872     UVector     tagNodes(*fStatus);
873     RBBINode    *tagNode;
874     int32_t     i;
875     int32_t     n;
876 
877     if (U_FAILURE(*fStatus)) {
878         return;
879     }
880     fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
881     if (U_FAILURE(*fStatus)) {
882         return;
883     }
884     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
885         tagNode = (RBBINode *)tagNodes.elementAt(i);
886 
887         for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
888             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
889             if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
890                 sortedAdd(&sd->fTagVals, tagNode->fVal);
891             }
892         }
893     }
894 }
895 
896 
897 
898 
899 //-----------------------------------------------------------------------------
900 //
901 //  mergeRuleStatusVals
902 //
903 //      Update the global table of rule status {tag} values
904 //      The rule builder has a global vector of status values that are common
905 //      for all tables.  Merge the ones from this table into the global set.
906 //
907 //-----------------------------------------------------------------------------
mergeRuleStatusVals()908 void  RBBITableBuilder::mergeRuleStatusVals() {
909     //
910     //  The basic outline of what happens here is this...
911     //
912     //    for each state in this state table
913     //       if the status tag list for this state is in the global statuses list
914     //           record where and
915     //           continue with the next state
916     //       else
917     //           add the tag list for this state to the global list.
918     //
919     int i;
920     int n;
921 
922     // Pre-set a single tag of {0} into the table.
923     //   We will need this as a default, for rule sets with no explicit tagging.
924     if (fRB->fRuleStatusVals->size() == 0) {
925         fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
926         fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
927     }
928 
929     //    For each state
930     for (n=0; n<fDStates->size(); n++) {
931         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
932         UVector *thisStatesTagValues = sd->fTagVals;
933         if (thisStatesTagValues == nullptr) {
934             // No tag values are explicitly associated with this state.
935             //   Set the default tag value.
936             sd->fTagsIdx = 0;
937             continue;
938         }
939 
940         // There are tag(s) associated with this state.
941         //   fTagsIdx will be the index into the global tag list for this state's tag values.
942         //   Initial value of -1 flags that we haven't got it set yet.
943         sd->fTagsIdx = -1;
944         int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
945         int32_t  nextTagGroupStart = 0;
946 
947         // Loop runs once per group of tags in the global list
948         while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
949             thisTagGroupStart = nextTagGroupStart;
950             nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
951             if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
952                 // The number of tags for this state is different from
953                 //    the number of tags in this group from the global list.
954                 //    Continue with the next group from the global list.
955                 continue;
956             }
957             // The lengths match, go ahead and compare the actual tag values
958             //    between this state and the group from the global list.
959             for (i=0; i<thisStatesTagValues->size(); i++) {
960                 if (thisStatesTagValues->elementAti(i) !=
961                     fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
962                     // Mismatch.
963                     break;
964                 }
965             }
966 
967             if (i == thisStatesTagValues->size()) {
968                 // We found a set of tag values in the global list that match
969                 //   those for this state.  Use them.
970                 sd->fTagsIdx = thisTagGroupStart;
971                 break;
972             }
973         }
974 
975         if (sd->fTagsIdx == -1) {
976             // No suitable entry in the global tag list already.  Add one
977             sd->fTagsIdx = fRB->fRuleStatusVals->size();
978             fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
979             for (i=0; i<thisStatesTagValues->size(); i++) {
980                 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
981             }
982         }
983     }
984 }
985 
986 
987 
988 
989 
990 
991 
992 //-----------------------------------------------------------------------------
993 //
994 //  sortedAdd  Add a value to a vector of sorted values (ints).
995 //             Do not replicate entries; if the value is already there, do not
996 //                add a second one.
997 //             Lazily create the vector if it does not already exist.
998 //
999 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)1000 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
1001     int32_t i;
1002 
1003     if (*vector == nullptr) {
1004         *vector = new UVector(*fStatus);
1005     }
1006     if (*vector == nullptr || U_FAILURE(*fStatus)) {
1007         return;
1008     }
1009     UVector *vec = *vector;
1010     int32_t  vSize = vec->size();
1011     for (i=0; i<vSize; i++) {
1012         int32_t valAtI = vec->elementAti(i);
1013         if (valAtI == val) {
1014             // The value is already in the vector.  Don't add it again.
1015             return;
1016         }
1017         if (valAtI > val) {
1018             break;
1019         }
1020     }
1021     vec->insertElementAt(val, i, *fStatus);
1022 }
1023 
1024 
1025 
1026 //-----------------------------------------------------------------------------
1027 //
1028 //  setAdd     Set operation on UVector
1029 //             dest = dest union source
1030 //             Elements may only appear once and must be sorted.
1031 //
1032 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)1033 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
1034     U_ASSERT(!dest->hasDeleter());
1035     U_ASSERT(!source->hasDeleter());
1036     int32_t destOriginalSize = dest->size();
1037     int32_t sourceSize       = source->size();
1038     int32_t di           = 0;
1039     MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
1040     void **destPtr, **sourcePtr;
1041     void **destLim, **sourceLim;
1042 
1043     if (destOriginalSize > destArray.getCapacity()) {
1044         if (destArray.resize(destOriginalSize) == nullptr) {
1045             return;
1046         }
1047     }
1048     destPtr = destArray.getAlias();
1049     destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
1050 
1051     if (sourceSize > sourceArray.getCapacity()) {
1052         if (sourceArray.resize(sourceSize) == nullptr) {
1053             return;
1054         }
1055     }
1056     sourcePtr = sourceArray.getAlias();
1057     sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
1058 
1059     // Avoid multiple "get element" calls by getting the contents into arrays
1060     (void) dest->toArray(destPtr);
1061     (void) source->toArray(sourcePtr);
1062 
1063     dest->setSize(sourceSize+destOriginalSize, *fStatus);
1064     if (U_FAILURE(*fStatus)) {
1065         return;
1066     }
1067 
1068     while (sourcePtr < sourceLim && destPtr < destLim) {
1069         if (*destPtr == *sourcePtr) {
1070             dest->setElementAt(*sourcePtr++, di++);
1071             destPtr++;
1072         }
1073         // This check is required for machines with segmented memory, like i5/OS.
1074         // Direct pointer comparison is not recommended.
1075         else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1076             dest->setElementAt(*destPtr++, di++);
1077         }
1078         else { /* *sourcePtr < *destPtr */
1079             dest->setElementAt(*sourcePtr++, di++);
1080         }
1081     }
1082 
1083     // At most one of these two cleanup loops will execute
1084     while (destPtr < destLim) {
1085         dest->setElementAt(*destPtr++, di++);
1086     }
1087     while (sourcePtr < sourceLim) {
1088         dest->setElementAt(*sourcePtr++, di++);
1089     }
1090 
1091     dest->setSize(di, *fStatus);
1092 }
1093 
1094 
1095 
1096 //-----------------------------------------------------------------------------
1097 //
1098 //  setEqual    Set operation on UVector.
1099 //              Compare for equality.
1100 //              Elements must be sorted.
1101 //
1102 //-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1103 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1104     return a->equals(*b);
1105 }
1106 
1107 
1108 //-----------------------------------------------------------------------------
1109 //
1110 //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
1111 //                 for each node in the tree.
1112 //
1113 //-----------------------------------------------------------------------------
1114 #ifdef RBBI_DEBUG
printPosSets(RBBINode * n)1115 void RBBITableBuilder::printPosSets(RBBINode *n) {
1116     if (n==nullptr) {
1117         return;
1118     }
1119     printf("\n");
1120     RBBINode::printNodeHeader();
1121     RBBINode::printNode(n);
1122     RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"true":"false");
1123 
1124     RBBIDebugPrintf("         firstpos:  ");
1125     printSet(n->fFirstPosSet);
1126 
1127     RBBIDebugPrintf("         lastpos:   ");
1128     printSet(n->fLastPosSet);
1129 
1130     RBBIDebugPrintf("         followpos: ");
1131     printSet(n->fFollowPos);
1132 
1133     printPosSets(n->fLeftChild);
1134     printPosSets(n->fRightChild);
1135 }
1136 #endif
1137 
1138 //
1139 //    findDuplCharClassFrom()
1140 //
findDuplCharClassFrom(IntPair * categories)1141 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1142     int32_t numStates = fDStates->size();
1143     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1144 
1145     for (; categories->first < numCols-1; categories->first++) {
1146         // Note: dictionary & non-dictionary columns cannot be merged.
1147         //       The limitSecond value prevents considering mixed pairs.
1148         //       Dictionary categories are >= DictCategoriesStart.
1149         //       Non dict categories are   <  DictCategoriesStart.
1150         int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
1151             fRB->fSetBuilder->getDictCategoriesStart() : numCols;
1152         for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
1153             // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1154             uint16_t table_base = 0;
1155             uint16_t table_dupl = 1;
1156             for (int32_t state=0; state<numStates; state++) {
1157                 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1158                 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1159                 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1160                 if (table_base != table_dupl) {
1161                     break;
1162                 }
1163             }
1164             if (table_base == table_dupl) {
1165                 return true;
1166             }
1167         }
1168     }
1169     return false;
1170 }
1171 
1172 
1173 //
1174 //    removeColumn()
1175 //
removeColumn(int32_t column)1176 void RBBITableBuilder::removeColumn(int32_t column) {
1177     int32_t numStates = fDStates->size();
1178     for (int32_t state=0; state<numStates; state++) {
1179         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1180         U_ASSERT(column < sd->fDtran->size());
1181         sd->fDtran->removeElementAt(column);
1182     }
1183 }
1184 
1185 /*
1186  * findDuplicateState
1187  */
findDuplicateState(IntPair * states)1188 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1189     int32_t numStates = fDStates->size();
1190     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1191 
1192     for (; states->first<numStates-1; states->first++) {
1193         RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1194         for (states->second=states->first+1; states->second<numStates; states->second++) {
1195             RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1196             if (firstSD->fAccepting != duplSD->fAccepting ||
1197                 firstSD->fLookAhead != duplSD->fLookAhead ||
1198                 firstSD->fTagsIdx   != duplSD->fTagsIdx) {
1199                 continue;
1200             }
1201             bool rowsMatch = true;
1202             for (int32_t col=0; col < numCols; ++col) {
1203                 int32_t firstVal = firstSD->fDtran->elementAti(col);
1204                 int32_t duplVal = duplSD->fDtran->elementAti(col);
1205                 if (!((firstVal == duplVal) ||
1206                         ((firstVal == states->first || firstVal == states->second) &&
1207                         (duplVal  == states->first || duplVal  == states->second)))) {
1208                     rowsMatch = false;
1209                     break;
1210                 }
1211             }
1212             if (rowsMatch) {
1213                 return true;
1214             }
1215         }
1216     }
1217     return false;
1218 }
1219 
1220 
findDuplicateSafeState(IntPair * states)1221 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1222     int32_t numStates = fSafeTable->size();
1223 
1224     for (; states->first<numStates-1; states->first++) {
1225         UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1226         for (states->second=states->first+1; states->second<numStates; states->second++) {
1227             UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1228             bool rowsMatch = true;
1229             int32_t numCols = firstRow->length();
1230             for (int32_t col=0; col < numCols; ++col) {
1231                 int32_t firstVal = firstRow->charAt(col);
1232                 int32_t duplVal = duplRow->charAt(col);
1233                 if (!((firstVal == duplVal) ||
1234                         ((firstVal == states->first || firstVal == states->second) &&
1235                         (duplVal  == states->first || duplVal  == states->second)))) {
1236                     rowsMatch = false;
1237                     break;
1238                 }
1239             }
1240             if (rowsMatch) {
1241                 return true;
1242             }
1243         }
1244     }
1245     return false;
1246 }
1247 
1248 
removeState(IntPair duplStates)1249 void RBBITableBuilder::removeState(IntPair duplStates) {
1250     const int32_t keepState = duplStates.first;
1251     const int32_t duplState = duplStates.second;
1252     U_ASSERT(keepState < duplState);
1253     U_ASSERT(duplState < fDStates->size());
1254 
1255     RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1256     fDStates->removeElementAt(duplState);
1257     delete duplSD;
1258 
1259     int32_t numStates = fDStates->size();
1260     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1261     for (int32_t state=0; state<numStates; ++state) {
1262         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1263         for (int32_t col=0; col<numCols; col++) {
1264             int32_t existingVal = sd->fDtran->elementAti(col);
1265             int32_t newVal = existingVal;
1266             if (existingVal == duplState) {
1267                 newVal = keepState;
1268             } else if (existingVal > duplState) {
1269                 newVal = existingVal - 1;
1270             }
1271             sd->fDtran->setElementAt(newVal, col);
1272         }
1273     }
1274 }
1275 
removeSafeState(IntPair duplStates)1276 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1277     const int32_t keepState = duplStates.first;
1278     const int32_t duplState = duplStates.second;
1279     U_ASSERT(keepState < duplState);
1280     U_ASSERT(duplState < fSafeTable->size());
1281 
1282     fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
1283                                               // and will auto-delete the removed element.
1284     int32_t numStates = fSafeTable->size();
1285     for (int32_t state=0; state<numStates; ++state) {
1286         UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1287         int32_t numCols = sd->length();
1288         for (int32_t col=0; col<numCols; col++) {
1289             int32_t existingVal = sd->charAt(col);
1290             int32_t newVal = existingVal;
1291             if (existingVal == duplState) {
1292                 newVal = keepState;
1293             } else if (existingVal > duplState) {
1294                 newVal = existingVal - 1;
1295             }
1296             sd->setCharAt(col, static_cast<char16_t>(newVal));
1297         }
1298     }
1299 }
1300 
1301 
1302 /*
1303  * RemoveDuplicateStates
1304  */
removeDuplicateStates()1305 int32_t RBBITableBuilder::removeDuplicateStates() {
1306     IntPair dupls = {3, 0};
1307     int32_t numStatesRemoved = 0;
1308 
1309     while (findDuplicateState(&dupls)) {
1310         // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1311         removeState(dupls);
1312         ++numStatesRemoved;
1313     }
1314     return numStatesRemoved;
1315 }
1316 
1317 
1318 //-----------------------------------------------------------------------------
1319 //
1320 //   getTableSize()    Calculate the size of the runtime form of this
1321 //                     state transition table.
1322 //
1323 //-----------------------------------------------------------------------------
getTableSize() const1324 int32_t  RBBITableBuilder::getTableSize() const {
1325     int32_t    size = 0;
1326     int32_t    numRows;
1327     int32_t    numCols;
1328     int32_t    rowSize;
1329 
1330     if (fTree == nullptr) {
1331         return 0;
1332     }
1333 
1334     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1335 
1336     numRows = fDStates->size();
1337     numCols = fRB->fSetBuilder->getNumCharCategories();
1338 
1339     if (use8BitsForTable()) {
1340         rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1341     } else {
1342         rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1343     }
1344     size   += numRows * rowSize;
1345     return size;
1346 }
1347 
use8BitsForTable() const1348 bool RBBITableBuilder::use8BitsForTable() const {
1349     return fDStates->size() <= kMaxStateFor8BitsTable;
1350 }
1351 
1352 //-----------------------------------------------------------------------------
1353 //
1354 //   exportTable()    export the state transition table in the format required
1355 //                    by the runtime engine.  getTableSize() bytes of memory
1356 //                    must be available at the output address "where".
1357 //
1358 //-----------------------------------------------------------------------------
exportTable(void * where)1359 void RBBITableBuilder::exportTable(void *where) {
1360     RBBIStateTable    *table = (RBBIStateTable *)where;
1361     uint32_t           state;
1362     int                col;
1363 
1364     if (U_FAILURE(*fStatus) || fTree == nullptr) {
1365         return;
1366     }
1367 
1368     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1369     if (catCount > 0x7fff ||
1370         fDStates->size() > 0x7fff) {
1371         *fStatus = U_BRK_INTERNAL_ERROR;
1372         return;
1373     }
1374 
1375     table->fNumStates = fDStates->size();
1376     table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
1377     table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
1378     table->fFlags     = 0;
1379     if (use8BitsForTable()) {
1380         table->fRowLen    = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1381         table->fFlags  |= RBBI_8BITS_ROWS;
1382     } else {
1383         table->fRowLen    = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1384     }
1385     if (fRB->fLookAheadHardBreak) {
1386         table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
1387     }
1388     if (fRB->fSetBuilder->sawBOF()) {
1389         table->fFlags  |= RBBI_BOF_REQUIRED;
1390     }
1391 
1392     for (state=0; state<table->fNumStates; state++) {
1393         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1394         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1395         if (use8BitsForTable()) {
1396             U_ASSERT (sd->fAccepting <= 255);
1397             U_ASSERT (sd->fLookAhead <= 255);
1398             U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255);
1399             RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1400             r8->fAccepting = sd->fAccepting;
1401             r8->fLookAhead = sd->fLookAhead;
1402             r8->fTagsIdx   = sd->fTagsIdx;
1403             for (col=0; col<catCount; col++) {
1404                 U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
1405                 r8->fNextState[col] = sd->fDtran->elementAti(col);
1406             }
1407         } else {
1408             U_ASSERT (sd->fAccepting <= 0xffff);
1409             U_ASSERT (sd->fLookAhead <= 0xffff);
1410             U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff);
1411             row->r16.fAccepting = sd->fAccepting;
1412             row->r16.fLookAhead = sd->fLookAhead;
1413             row->r16.fTagsIdx   = sd->fTagsIdx;
1414             for (col=0; col<catCount; col++) {
1415                 row->r16.fNextState[col] = sd->fDtran->elementAti(col);
1416             }
1417         }
1418     }
1419 }
1420 
1421 
1422 /**
1423  *   Synthesize a safe state table from the main state table.
1424  */
buildSafeReverseTable(UErrorCode & status)1425 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1426     // The safe table creation has three steps:
1427 
1428     // 1. Identify pairs of character classes that are "safe." Safe means that boundaries
1429     // following the pair do not depend on context or state before the pair. To test
1430     // whether a pair is safe, run it through the main forward state table, starting
1431     // from each state. If the the final state is the same, no matter what the starting state,
1432     // the pair is safe.
1433     //
1434     // 2. Build a state table that recognizes the safe pairs. It's similar to their
1435     // forward table, with a column for each input character [class], and a row for
1436     // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1437     // create an additional state for each input character category; being in
1438     // one of these states means that the character has been seen, and is potentially
1439     // the first of a pair. In each of these rows, the entry for the second character
1440     // of a safe pair is set to the stop state (0), indicating that a match was found.
1441     // All other table entries are set to the state corresponding the current input
1442     // character, allowing that character to be the of a start following pair.
1443     //
1444     // Because the safe rules are to be run in reverse, moving backwards in the text,
1445     // the first and second pair categories are swapped when building the table.
1446     //
1447     // 3. Compress the table. There are typically many rows (states) that are
1448     // equivalent - that have zeroes (match completed) in the same columns -
1449     // and can be folded together.
1450 
1451     // Each safe pair is stored as two UChars in the safePair string.
1452     UnicodeString safePairs;
1453 
1454     int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1455     int32_t numStates = fDStates->size();
1456 
1457     for (int32_t c1=0; c1<numCharClasses; ++c1) {
1458         for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1459             int32_t wantedEndState = -1;
1460             int32_t endState = 0;
1461             for (int32_t startState = 1; startState < numStates; ++startState) {
1462                 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1463                 int32_t s2 = startStateD->fDtran->elementAti(c1);
1464                 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1465                 endState = s2StateD->fDtran->elementAti(c2);
1466                 if (wantedEndState < 0) {
1467                     wantedEndState = endState;
1468                 } else {
1469                     if (wantedEndState != endState) {
1470                         break;
1471                     }
1472                 }
1473             }
1474             if (wantedEndState == endState) {
1475                 safePairs.append((char16_t)c1);
1476                 safePairs.append((char16_t)c2);
1477                 // printf("(%d, %d) ", c1, c2);
1478             }
1479         }
1480         // printf("\n");
1481     }
1482 
1483     // Populate the initial safe table.
1484     // The table as a whole is UVector<UnicodeString>
1485     // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1486     // Row 0 is the stop state.
1487     // Row 1 is the start state.
1488     // Row 2 and beyond are other states, initially one per char class, but
1489     //   after initial construction, many of the states will be combined, compacting the table.
1490     // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1491     // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1492 
1493     U_ASSERT(fSafeTable == nullptr);
1494     LocalPointer<UVector> lpSafeTable(
1495         new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status), status);
1496     if (U_FAILURE(status)) {
1497         return;
1498     }
1499     fSafeTable = lpSafeTable.orphan();
1500     for (int32_t row=0; row<numCharClasses + 2; ++row) {
1501         LocalPointer<UnicodeString> lpString(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1502         fSafeTable->adoptElement(lpString.orphan(), status);
1503     }
1504     if (U_FAILURE(status)) {
1505         return;
1506     }
1507 
1508     // From the start state, each input char class transitions to the state for that input.
1509     UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1510     for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1511         // Note: +2 for the start & stop state.
1512         startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1513     }
1514 
1515     // Initially make every other state table row look like the start state row,
1516     for (int32_t row=2; row<numCharClasses+2; ++row) {
1517         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1518         rowState = startState;   // UnicodeString assignment, copies contents.
1519     }
1520 
1521     // Run through the safe pairs, set the next state to zero when pair has been seen.
1522     // Zero being the stop state, meaning we found a safe point.
1523     for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1524         int32_t c1 = safePairs.charAt(pairIdx);
1525         int32_t c2 = safePairs.charAt(pairIdx + 1);
1526 
1527         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1528         rowState.setCharAt(c1, 0);
1529     }
1530 
1531     // Remove duplicate or redundant rows from the table.
1532     IntPair states = {1, 0};
1533     while (findDuplicateSafeState(&states)) {
1534         // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1535         removeSafeState(states);
1536     }
1537 }
1538 
1539 
1540 //-----------------------------------------------------------------------------
1541 //
1542 //   getSafeTableSize()    Calculate the size of the runtime form of this
1543 //                         safe state table.
1544 //
1545 //-----------------------------------------------------------------------------
getSafeTableSize() const1546 int32_t  RBBITableBuilder::getSafeTableSize() const {
1547     int32_t    size = 0;
1548     int32_t    numRows;
1549     int32_t    numCols;
1550     int32_t    rowSize;
1551 
1552     if (fSafeTable == nullptr) {
1553         return 0;
1554     }
1555 
1556     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1557 
1558     numRows = fSafeTable->size();
1559     numCols = fRB->fSetBuilder->getNumCharCategories();
1560 
1561     if (use8BitsForSafeTable()) {
1562         rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1563     } else {
1564         rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1565     }
1566     size   += numRows * rowSize;
1567     return size;
1568 }
1569 
use8BitsForSafeTable() const1570 bool RBBITableBuilder::use8BitsForSafeTable() const {
1571     return fSafeTable->size() <= kMaxStateFor8BitsTable;
1572 }
1573 
1574 //-----------------------------------------------------------------------------
1575 //
1576 //   exportSafeTable()   export the state transition table in the format required
1577 //                       by the runtime engine.  getTableSize() bytes of memory
1578 //                       must be available at the output address "where".
1579 //
1580 //-----------------------------------------------------------------------------
exportSafeTable(void * where)1581 void RBBITableBuilder::exportSafeTable(void *where) {
1582     RBBIStateTable    *table = (RBBIStateTable *)where;
1583     uint32_t           state;
1584     int                col;
1585 
1586     if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1587         return;
1588     }
1589 
1590     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1591     if (catCount > 0x7fff ||
1592             fSafeTable->size() > 0x7fff) {
1593         *fStatus = U_BRK_INTERNAL_ERROR;
1594         return;
1595     }
1596 
1597     table->fNumStates = fSafeTable->size();
1598     table->fFlags     = 0;
1599     if (use8BitsForSafeTable()) {
1600         table->fRowLen    = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1601         table->fFlags  |= RBBI_8BITS_ROWS;
1602     } else {
1603         table->fRowLen    = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1604     }
1605 
1606     for (state=0; state<table->fNumStates; state++) {
1607         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1608         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1609         if (use8BitsForSafeTable()) {
1610             RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1611             r8->fAccepting = 0;
1612             r8->fLookAhead = 0;
1613             r8->fTagsIdx    = 0;
1614             for (col=0; col<catCount; col++) {
1615                 U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
1616                 r8->fNextState[col] = static_cast<uint8_t>(rowString->charAt(col));
1617             }
1618         } else {
1619             row->r16.fAccepting = 0;
1620             row->r16.fLookAhead = 0;
1621             row->r16.fTagsIdx    = 0;
1622             for (col=0; col<catCount; col++) {
1623                 row->r16.fNextState[col] = rowString->charAt(col);
1624             }
1625         }
1626     }
1627 }
1628 
1629 
1630 
1631 
1632 //-----------------------------------------------------------------------------
1633 //
1634 //   printSet    Debug function.   Print the contents of a UVector
1635 //
1636 //-----------------------------------------------------------------------------
1637 #ifdef RBBI_DEBUG
printSet(UVector * s)1638 void RBBITableBuilder::printSet(UVector *s) {
1639     int32_t  i;
1640     for (i=0; i<s->size(); i++) {
1641         const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1642         RBBIDebugPrintf("%5d", v==nullptr? -1 : v->fSerialNum);
1643     }
1644     RBBIDebugPrintf("\n");
1645 }
1646 #endif
1647 
1648 
1649 //-----------------------------------------------------------------------------
1650 //
1651 //   printStates    Debug Function.  Dump the fully constructed state transition table.
1652 //
1653 //-----------------------------------------------------------------------------
1654 #ifdef RBBI_DEBUG
printStates()1655 void RBBITableBuilder::printStates() {
1656     int     c;    // input "character"
1657     int     n;    // state number
1658 
1659     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1660     RBBIDebugPrintf("      | Acc  LA    Tag");
1661     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1662         RBBIDebugPrintf(" %3d", c);
1663     }
1664     RBBIDebugPrintf("\n");
1665     RBBIDebugPrintf("      |---------------");
1666     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1667         RBBIDebugPrintf("----");
1668     }
1669     RBBIDebugPrintf("\n");
1670 
1671     for (n=0; n<fDStates->size(); n++) {
1672         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1673         RBBIDebugPrintf("  %3d | " , n);
1674         RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1675         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1676             RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
1677         }
1678         RBBIDebugPrintf("\n");
1679     }
1680     RBBIDebugPrintf("\n\n");
1681 }
1682 #endif
1683 
1684 
1685 //-----------------------------------------------------------------------------
1686 //
1687 //   printSafeTable    Debug Function.  Dump the fully constructed safe table.
1688 //
1689 //-----------------------------------------------------------------------------
1690 #ifdef RBBI_DEBUG
printReverseTable()1691 void RBBITableBuilder::printReverseTable() {
1692     int     c;    // input "character"
1693     int     n;    // state number
1694 
1695     RBBIDebugPrintf("    Safe Reverse Table \n");
1696     if (fSafeTable == nullptr) {
1697         RBBIDebugPrintf("   --- nullptr ---\n");
1698         return;
1699     }
1700     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1701     RBBIDebugPrintf("      | Acc  LA    Tag");
1702     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1703         RBBIDebugPrintf(" %2d", c);
1704     }
1705     RBBIDebugPrintf("\n");
1706     RBBIDebugPrintf("      |---------------");
1707     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1708         RBBIDebugPrintf("---");
1709     }
1710     RBBIDebugPrintf("\n");
1711 
1712     for (n=0; n<fSafeTable->size(); n++) {
1713         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1714         RBBIDebugPrintf("  %3d | " , n);
1715         RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
1716         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1717             RBBIDebugPrintf(" %2d", rowString->charAt(c));
1718         }
1719         RBBIDebugPrintf("\n");
1720     }
1721     RBBIDebugPrintf("\n\n");
1722 }
1723 #endif
1724 
1725 
1726 
1727 //-----------------------------------------------------------------------------
1728 //
1729 //   printRuleStatusTable    Debug Function.  Dump the common rule status table
1730 //
1731 //-----------------------------------------------------------------------------
1732 #ifdef RBBI_DEBUG
printRuleStatusTable()1733 void RBBITableBuilder::printRuleStatusTable() {
1734     int32_t  thisRecord = 0;
1735     int32_t  nextRecord = 0;
1736     int      i;
1737     UVector  *tbl = fRB->fRuleStatusVals;
1738 
1739     RBBIDebugPrintf("index |  tags \n");
1740     RBBIDebugPrintf("-------------------\n");
1741 
1742     while (nextRecord < tbl->size()) {
1743         thisRecord = nextRecord;
1744         nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1745         RBBIDebugPrintf("%4d   ", thisRecord);
1746         for (i=thisRecord+1; i<nextRecord; i++) {
1747             RBBIDebugPrintf("  %5d", tbl->elementAti(i));
1748         }
1749         RBBIDebugPrintf("\n");
1750     }
1751     RBBIDebugPrintf("\n\n");
1752 }
1753 #endif
1754 
1755 
1756 //-----------------------------------------------------------------------------
1757 //
1758 //   RBBIStateDescriptor     Methods.  This is a very struct-like class
1759 //                           Most access is directly to the fields.
1760 //
1761 //-----------------------------------------------------------------------------
1762 
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1763 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1764     fMarked    = false;
1765     fAccepting = 0;
1766     fLookAhead = 0;
1767     fTagsIdx   = 0;
1768     fTagVals   = nullptr;
1769     fPositions = nullptr;
1770     fDtran     = nullptr;
1771 
1772     fDtran     = new UVector32(lastInputSymbol+1, *fStatus);
1773     if (U_FAILURE(*fStatus)) {
1774         return;
1775     }
1776     if (fDtran == nullptr) {
1777         *fStatus = U_MEMORY_ALLOCATION_ERROR;
1778         return;
1779     }
1780     fDtran->setSize(lastInputSymbol+1);    // fDtran needs to be pre-sized.
1781                                            //   It is indexed by input symbols, and will
1782                                            //   hold  the next state number for each
1783                                            //   symbol.
1784 }
1785 
1786 
~RBBIStateDescriptor()1787 RBBIStateDescriptor::~RBBIStateDescriptor() {
1788     delete       fPositions;
1789     delete       fDtran;
1790     delete       fTagVals;
1791     fPositions = nullptr;
1792     fDtran     = nullptr;
1793     fTagVals   = nullptr;
1794 }
1795 
1796 U_NAMESPACE_END
1797 
1798 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1799