1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 // rbbitblb.cpp
11 //
12
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbiscan.h"
22 #include "rbbisetb.h"
23 #include "rbbidata.h"
24 #include "cstring.h"
25 #include "uassert.h"
26 #include "uvectr32.h"
27 #include "cmemory.h"
28
29 U_NAMESPACE_BEGIN
30
31 const int32_t kMaxStateFor8BitsTable = 255;
32
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode,UErrorCode & status)33 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
34 fRB(rb),
35 fTree(*rootNode),
36 fStatus(&status),
37 fDStates(nullptr),
38 fSafeTable(nullptr) {
39 if (U_FAILURE(status)) {
40 return;
41 }
42 // fDStates is UVector<RBBIStateDescriptor *>
43 fDStates = new UVector(status);
44 if (U_SUCCESS(status) && fDStates == nullptr ) {
45 status = U_MEMORY_ALLOCATION_ERROR;
46 }
47 }
48
49
50
~RBBITableBuilder()51 RBBITableBuilder::~RBBITableBuilder() {
52 int i;
53 for (i=0; i<fDStates->size(); i++) {
54 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
55 }
56 delete fDStates;
57 delete fSafeTable;
58 delete fLookAheadRuleMap;
59 }
60
61
62 //-----------------------------------------------------------------------------
63 //
64 // RBBITableBuilder::buildForwardTable - This is the main function for building
65 // the DFA state transition table from the RBBI rules parse tree.
66 //
67 //-----------------------------------------------------------------------------
buildForwardTable()68 void RBBITableBuilder::buildForwardTable() {
69
70 if (U_FAILURE(*fStatus)) {
71 return;
72 }
73
74 // If there were no rules, just return. This situation can easily arise
75 // for the reverse rules.
76 if (fTree==NULL) {
77 return;
78 }
79
80 //
81 // Walk through the tree, replacing any references to $variables with a copy of the
82 // parse tree for the substition expression.
83 //
84 fTree = fTree->flattenVariables();
85 #ifdef RBBI_DEBUG
86 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
87 RBBIDebugPuts("\nParse tree after flattening variable references.");
88 RBBINode::printTree(fTree, TRUE);
89 }
90 #endif
91
92 //
93 // If the rules contained any references to {bof}
94 // add a {bof} <cat> <former root of tree> to the
95 // tree. Means that all matches must start out with the
96 // {bof} fake character.
97 //
98 if (fRB->fSetBuilder->sawBOF()) {
99 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
100 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
101 // Delete and exit if memory allocation failed.
102 if (bofTop == NULL || bofLeaf == NULL) {
103 *fStatus = U_MEMORY_ALLOCATION_ERROR;
104 delete bofTop;
105 delete bofLeaf;
106 return;
107 }
108 bofTop->fLeftChild = bofLeaf;
109 bofTop->fRightChild = fTree;
110 bofLeaf->fParent = bofTop;
111 bofLeaf->fVal = 2; // Reserved value for {bof}.
112 fTree = bofTop;
113 }
114
115 //
116 // Add a unique right-end marker to the expression.
117 // Appears as a cat-node, left child being the original tree,
118 // right child being the end marker.
119 //
120 RBBINode *cn = new RBBINode(RBBINode::opCat);
121 // Exit if memory allocation failed.
122 if (cn == NULL) {
123 *fStatus = U_MEMORY_ALLOCATION_ERROR;
124 return;
125 }
126 cn->fLeftChild = fTree;
127 fTree->fParent = cn;
128 RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
129 // Delete and exit if memory allocation failed.
130 if (cn->fRightChild == NULL) {
131 *fStatus = U_MEMORY_ALLOCATION_ERROR;
132 delete cn;
133 return;
134 }
135 cn->fRightChild->fParent = cn;
136 fTree = cn;
137
138 //
139 // Replace all references to UnicodeSets with the tree for the equivalent
140 // expression.
141 //
142 fTree->flattenSets();
143 #ifdef RBBI_DEBUG
144 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
145 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
146 RBBINode::printTree(fTree, TRUE);
147 }
148 #endif
149
150
151 //
152 // calculate the functions nullable, firstpos, lastpos and followpos on
153 // nodes in the parse tree.
154 // See the algorithm description in Aho.
155 // Understanding how this works by looking at the code alone will be
156 // nearly impossible.
157 //
158 calcNullable(fTree);
159 calcFirstPos(fTree);
160 calcLastPos(fTree);
161 calcFollowPos(fTree);
162 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
163 RBBIDebugPuts("\n");
164 printPosSets(fTree);
165 }
166
167 //
168 // For "chained" rules, modify the followPos sets
169 //
170 if (fRB->fChainRules) {
171 calcChainedFollowPos(fTree, endMarkerNode);
172 }
173
174 //
175 // BOF (start of input) test fixup.
176 //
177 if (fRB->fSetBuilder->sawBOF()) {
178 bofFixup();
179 }
180
181 //
182 // Build the DFA state transition tables.
183 //
184 buildStateTable();
185 mapLookAheadRules();
186 flagAcceptingStates();
187 flagLookAheadStates();
188 flagTaggedStates();
189
190 //
191 // Update the global table of rule status {tag} values
192 // The rule builder has a global vector of status values that are common
193 // for all tables. Merge the ones from this table into the global set.
194 //
195 mergeRuleStatusVals();
196 }
197
198
199
200 //-----------------------------------------------------------------------------
201 //
202 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
203 //
204 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)205 void RBBITableBuilder::calcNullable(RBBINode *n) {
206 if (n == NULL) {
207 return;
208 }
209 if (n->fType == RBBINode::setRef ||
210 n->fType == RBBINode::endMark ) {
211 // These are non-empty leaf node types.
212 n->fNullable = FALSE;
213 return;
214 }
215
216 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
217 // Lookahead marker node. It's a leaf, so no recursion on children.
218 // It's nullable because it does not match any literal text from the input stream.
219 n->fNullable = TRUE;
220 return;
221 }
222
223
224 // The node is not a leaf.
225 // Calculate nullable on its children.
226 calcNullable(n->fLeftChild);
227 calcNullable(n->fRightChild);
228
229 // Apply functions from table 3.40 in Aho
230 if (n->fType == RBBINode::opOr) {
231 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
232 }
233 else if (n->fType == RBBINode::opCat) {
234 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
235 }
236 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
237 n->fNullable = TRUE;
238 }
239 else {
240 n->fNullable = FALSE;
241 }
242 }
243
244
245
246
247 //-----------------------------------------------------------------------------
248 //
249 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
250 //
251 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)252 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
253 if (n == NULL) {
254 return;
255 }
256 if (n->fType == RBBINode::leafChar ||
257 n->fType == RBBINode::endMark ||
258 n->fType == RBBINode::lookAhead ||
259 n->fType == RBBINode::tag) {
260 // These are non-empty leaf node types.
261 // Note: In order to maintain the sort invariant on the set,
262 // this function should only be called on a node whose set is
263 // empty to start with.
264 n->fFirstPosSet->addElement(n, *fStatus);
265 return;
266 }
267
268 // The node is not a leaf.
269 // Calculate firstPos on its children.
270 calcFirstPos(n->fLeftChild);
271 calcFirstPos(n->fRightChild);
272
273 // Apply functions from table 3.40 in Aho
274 if (n->fType == RBBINode::opOr) {
275 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277 }
278 else if (n->fType == RBBINode::opCat) {
279 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
280 if (n->fLeftChild->fNullable) {
281 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
282 }
283 }
284 else if (n->fType == RBBINode::opStar ||
285 n->fType == RBBINode::opQuestion ||
286 n->fType == RBBINode::opPlus) {
287 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
288 }
289 }
290
291
292
293 //-----------------------------------------------------------------------------
294 //
295 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
296 //
297 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)298 void RBBITableBuilder::calcLastPos(RBBINode *n) {
299 if (n == NULL) {
300 return;
301 }
302 if (n->fType == RBBINode::leafChar ||
303 n->fType == RBBINode::endMark ||
304 n->fType == RBBINode::lookAhead ||
305 n->fType == RBBINode::tag) {
306 // These are non-empty leaf node types.
307 // Note: In order to maintain the sort invariant on the set,
308 // this function should only be called on a node whose set is
309 // empty to start with.
310 n->fLastPosSet->addElement(n, *fStatus);
311 return;
312 }
313
314 // The node is not a leaf.
315 // Calculate lastPos on its children.
316 calcLastPos(n->fLeftChild);
317 calcLastPos(n->fRightChild);
318
319 // Apply functions from table 3.40 in Aho
320 if (n->fType == RBBINode::opOr) {
321 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
322 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
323 }
324 else if (n->fType == RBBINode::opCat) {
325 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
326 if (n->fRightChild->fNullable) {
327 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
328 }
329 }
330 else if (n->fType == RBBINode::opStar ||
331 n->fType == RBBINode::opQuestion ||
332 n->fType == RBBINode::opPlus) {
333 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
334 }
335 }
336
337
338
339 //-----------------------------------------------------------------------------
340 //
341 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
342 //
343 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)344 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
345 if (n == NULL ||
346 n->fType == RBBINode::leafChar ||
347 n->fType == RBBINode::endMark) {
348 return;
349 }
350
351 calcFollowPos(n->fLeftChild);
352 calcFollowPos(n->fRightChild);
353
354 // Aho rule #1
355 if (n->fType == RBBINode::opCat) {
356 RBBINode *i; // is 'i' in Aho's description
357 uint32_t ix;
358
359 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
360
361 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
362 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
363 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
364 }
365 }
366
367 // Aho rule #2
368 if (n->fType == RBBINode::opStar ||
369 n->fType == RBBINode::opPlus) {
370 RBBINode *i; // again, n and i are the names from Aho's description.
371 uint32_t ix;
372
373 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
374 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
375 setAdd(i->fFollowPos, n->fFirstPosSet);
376 }
377 }
378
379
380
381 }
382
383 //-----------------------------------------------------------------------------
384 //
385 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
386 // as roots of a rule to a destination vector.
387 //
388 //-----------------------------------------------------------------------------
addRuleRootNodes(UVector * dest,RBBINode * node)389 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
390 if (node == NULL || U_FAILURE(*fStatus)) {
391 return;
392 }
393 if (node->fRuleRoot) {
394 dest->addElement(node, *fStatus);
395 // Note: rules cannot nest. If we found a rule start node,
396 // no child node can also be a start node.
397 return;
398 }
399 addRuleRootNodes(dest, node->fLeftChild);
400 addRuleRootNodes(dest, node->fRightChild);
401 }
402
403 //-----------------------------------------------------------------------------
404 //
405 // calcChainedFollowPos. Modify the previously calculated followPos sets
406 // to implement rule chaining. NOT described by Aho
407 //
408 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree,RBBINode * endMarkNode)409 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
410
411 UVector leafNodes(*fStatus);
412 if (U_FAILURE(*fStatus)) {
413 return;
414 }
415
416 // get a list all leaf nodes
417 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
418 if (U_FAILURE(*fStatus)) {
419 return;
420 }
421
422 // Collect all leaf nodes that can start matches for rules
423 // with inbound chaining enabled, which is the union of the
424 // firstPosition sets from each of the rule root nodes.
425
426 UVector ruleRootNodes(*fStatus);
427 addRuleRootNodes(&ruleRootNodes, tree);
428
429 UVector matchStartNodes(*fStatus);
430 for (int j=0; j<ruleRootNodes.size(); ++j) {
431 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
432 if (node->fChainIn) {
433 setAdd(&matchStartNodes, node->fFirstPosSet);
434 }
435 }
436 if (U_FAILURE(*fStatus)) {
437 return;
438 }
439
440 int32_t endNodeIx;
441 int32_t startNodeIx;
442
443 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
444 RBBINode *endNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
445
446 // Identify leaf nodes that correspond to overall rule match positions.
447 // These include the endMarkNode in their followPos sets.
448 //
449 // Note: do not consider other end marker nodes, those that are added to
450 // look-ahead rules. These can't chain; a match immediately stops
451 // further matching. This leaves exactly one end marker node, the one
452 // at the end of the complete tree.
453
454 if (!endNode->fFollowPos->contains(endMarkNode)) {
455 continue;
456 }
457
458 // We've got a node that can end a match.
459
460 // !!LBCMNoChain implementation: If this node's val correspond to
461 // the Line Break $CM char class, don't chain from it.
462 // TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
463 // by any of the standard ICU rules.
464 if (fRB->fLBCMNoChain) {
465 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
466 if (c != -1) {
467 // c == -1 occurs with sets containing only the {eof} marker string.
468 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
469 if (cLBProp == U_LB_COMBINING_MARK) {
470 continue;
471 }
472 }
473 }
474
475 // Now iterate over the nodes that can start a match, looking for ones
476 // with the same char class as our ending node.
477 RBBINode *startNode;
478 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
479 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
480 if (startNode->fType != RBBINode::leafChar) {
481 continue;
482 }
483
484 if (endNode->fVal == startNode->fVal) {
485 // The end val (character class) of one possible match is the
486 // same as the start of another.
487
488 // Add all nodes from the followPos of the start node to the
489 // followPos set of the end node, which will have the effect of
490 // letting matches transition from a match state at endNode
491 // to the second char of a match starting with startNode.
492 setAdd(endNode->fFollowPos, startNode->fFollowPos);
493 }
494 }
495 }
496 }
497
498
499 //-----------------------------------------------------------------------------
500 //
501 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
502 // Do an swizzle similar to chaining, modifying the followPos set of
503 // the bofNode to include the followPos nodes from other {bot} nodes
504 // scattered through the tree.
505 //
506 // This function has much in common with calcChainedFollowPos().
507 //
508 //-----------------------------------------------------------------------------
bofFixup()509 void RBBITableBuilder::bofFixup() {
510
511 if (U_FAILURE(*fStatus)) {
512 return;
513 }
514
515 // The parse tree looks like this ...
516 // fTree root ---> <cat>
517 // / \ .
518 // <cat> <#end node>
519 // / \ .
520 // <bofNode> rest
521 // of tree
522 //
523 // We will be adding things to the followPos set of the <bofNode>
524 //
525 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
526 U_ASSERT(bofNode->fType == RBBINode::leafChar);
527 U_ASSERT(bofNode->fVal == 2);
528
529 // Get all nodes that can be the start a match of the user-written rules
530 // (excluding the fake bofNode)
531 // We want the nodes that can start a match in the
532 // part labeled "rest of tree"
533 //
534 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
535
536 RBBINode *startNode;
537 int startNodeIx;
538 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
539 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
540 if (startNode->fType != RBBINode::leafChar) {
541 continue;
542 }
543
544 if (startNode->fVal == bofNode->fVal) {
545 // We found a leaf node corresponding to a {bof} that was
546 // explicitly written into a rule.
547 // Add everything from the followPos set of this node to the
548 // followPos set of the fake bofNode at the start of the tree.
549 //
550 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
551 }
552 }
553 }
554
555 //-----------------------------------------------------------------------------
556 //
557 // buildStateTable() Determine the set of runtime DFA states and the
558 // transition tables for these states, by the algorithm
559 // of fig. 3.44 in Aho.
560 //
561 // Most of the comments are quotes of Aho's psuedo-code.
562 //
563 //-----------------------------------------------------------------------------
buildStateTable()564 void RBBITableBuilder::buildStateTable() {
565 if (U_FAILURE(*fStatus)) {
566 return;
567 }
568 RBBIStateDescriptor *failState;
569 // Set it to NULL to avoid uninitialized warning
570 RBBIStateDescriptor *initialState = NULL;
571 //
572 // Add a dummy state 0 - the stop state. Not from Aho.
573 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
574 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
575 if (failState == NULL) {
576 *fStatus = U_MEMORY_ALLOCATION_ERROR;
577 goto ExitBuildSTdeleteall;
578 }
579 failState->fPositions = new UVector(*fStatus);
580 if (failState->fPositions == NULL) {
581 *fStatus = U_MEMORY_ALLOCATION_ERROR;
582 }
583 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
584 goto ExitBuildSTdeleteall;
585 }
586 fDStates->addElement(failState, *fStatus);
587 if (U_FAILURE(*fStatus)) {
588 goto ExitBuildSTdeleteall;
589 }
590
591 // initially, the only unmarked state in Dstates is firstpos(root),
592 // where toot is the root of the syntax tree for (r)#;
593 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
594 if (initialState == NULL) {
595 *fStatus = U_MEMORY_ALLOCATION_ERROR;
596 }
597 if (U_FAILURE(*fStatus)) {
598 goto ExitBuildSTdeleteall;
599 }
600 initialState->fPositions = new UVector(*fStatus);
601 if (initialState->fPositions == NULL) {
602 *fStatus = U_MEMORY_ALLOCATION_ERROR;
603 }
604 if (U_FAILURE(*fStatus)) {
605 goto ExitBuildSTdeleteall;
606 }
607 setAdd(initialState->fPositions, fTree->fFirstPosSet);
608 fDStates->addElement(initialState, *fStatus);
609 if (U_FAILURE(*fStatus)) {
610 goto ExitBuildSTdeleteall;
611 }
612
613 // while there is an unmarked state T in Dstates do begin
614 for (;;) {
615 RBBIStateDescriptor *T = NULL;
616 int32_t tx;
617 for (tx=1; tx<fDStates->size(); tx++) {
618 RBBIStateDescriptor *temp;
619 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
620 if (temp->fMarked == FALSE) {
621 T = temp;
622 break;
623 }
624 }
625 if (T == NULL) {
626 break;
627 }
628
629 // mark T;
630 T->fMarked = TRUE;
631
632 // for each input symbol a do begin
633 int32_t a;
634 for (a = 1; a<=lastInputSymbol; a++) {
635 // let U be the set of positions that are in followpos(p)
636 // for some position p in T
637 // such that the symbol at position p is a;
638 UVector *U = NULL;
639 RBBINode *p;
640 int32_t px;
641 for (px=0; px<T->fPositions->size(); px++) {
642 p = (RBBINode *)T->fPositions->elementAt(px);
643 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
644 if (U == NULL) {
645 U = new UVector(*fStatus);
646 if (U == NULL) {
647 *fStatus = U_MEMORY_ALLOCATION_ERROR;
648 goto ExitBuildSTdeleteall;
649 }
650 }
651 setAdd(U, p->fFollowPos);
652 }
653 }
654
655 // if U is not empty and not in DStates then
656 int32_t ux = 0;
657 UBool UinDstates = FALSE;
658 if (U != NULL) {
659 U_ASSERT(U->size() > 0);
660 int ix;
661 for (ix=0; ix<fDStates->size(); ix++) {
662 RBBIStateDescriptor *temp2;
663 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
664 if (setEquals(U, temp2->fPositions)) {
665 delete U;
666 U = temp2->fPositions;
667 ux = ix;
668 UinDstates = TRUE;
669 break;
670 }
671 }
672
673 // Add U as an unmarked state to Dstates
674 if (!UinDstates)
675 {
676 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
677 if (newState == NULL) {
678 *fStatus = U_MEMORY_ALLOCATION_ERROR;
679 }
680 if (U_FAILURE(*fStatus)) {
681 goto ExitBuildSTdeleteall;
682 }
683 newState->fPositions = U;
684 fDStates->addElement(newState, *fStatus);
685 if (U_FAILURE(*fStatus)) {
686 return;
687 }
688 ux = fDStates->size()-1;
689 }
690
691 // Dtran[T, a] := U;
692 T->fDtran->setElementAt(ux, a);
693 }
694 }
695 }
696 return;
697 // delete local pointers only if error occured.
698 ExitBuildSTdeleteall:
699 delete initialState;
700 delete failState;
701 }
702
703
704 /**
705 * mapLookAheadRules
706 *
707 */
mapLookAheadRules()708 void RBBITableBuilder::mapLookAheadRules() {
709 fLookAheadRuleMap = new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
710 if (fLookAheadRuleMap == nullptr) {
711 *fStatus = U_MEMORY_ALLOCATION_ERROR;
712 }
713 if (U_FAILURE(*fStatus)) {
714 return;
715 }
716 fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
717
718 for (int32_t n=0; n<fDStates->size(); n++) {
719 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
720 int32_t laSlotForState = 0;
721
722 // Establish the look-ahead slot for this state, if the state covers
723 // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
724
725 // If any of the look-ahead nodes already have a slot assigned, use it,
726 // otherwise assign a new one.
727
728 bool sawLookAheadNode = false;
729 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
730 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
731 if (node->fType != RBBINode::NodeType::lookAhead) {
732 continue;
733 }
734 sawLookAheadNode = true;
735 int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
736 U_ASSERT(ruleNum < fLookAheadRuleMap->size());
737 U_ASSERT(ruleNum > 0);
738 int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
739 if (laSlot != 0) {
740 if (laSlotForState == 0) {
741 laSlotForState = laSlot;
742 } else {
743 // TODO: figure out if this can fail, change to setting an error code if so.
744 U_ASSERT(laSlot == laSlotForState);
745 }
746 }
747 }
748 if (!sawLookAheadNode) {
749 continue;
750 }
751
752 if (laSlotForState == 0) {
753 laSlotForState = ++fLASlotsInUse;
754 }
755
756 // For each look ahead node covered by this state,
757 // set the mapping from the node's rule number to the look ahead slot.
758 // There can be multiple nodes/rule numbers going to the same la slot.
759
760 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
761 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
762 if (node->fType != RBBINode::NodeType::lookAhead) {
763 continue;
764 }
765 int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
766 int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
767 (void)existingVal;
768 U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
769 fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
770 }
771 }
772
773 }
774
775 //-----------------------------------------------------------------------------
776 //
777 // flagAcceptingStates Identify accepting states.
778 // First get a list of all of the end marker nodes.
779 // Then, for each state s,
780 // if s contains one of the end marker nodes in its list of tree positions then
781 // s is an accepting state.
782 //
783 //-----------------------------------------------------------------------------
flagAcceptingStates()784 void RBBITableBuilder::flagAcceptingStates() {
785 if (U_FAILURE(*fStatus)) {
786 return;
787 }
788 UVector endMarkerNodes(*fStatus);
789 RBBINode *endMarker;
790 int32_t i;
791 int32_t n;
792
793 if (U_FAILURE(*fStatus)) {
794 return;
795 }
796
797 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
798 if (U_FAILURE(*fStatus)) {
799 return;
800 }
801
802 for (i=0; i<endMarkerNodes.size(); i++) {
803 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
804 for (n=0; n<fDStates->size(); n++) {
805 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
806 if (sd->fPositions->indexOf(endMarker) >= 0) {
807 // Any non-zero value for fAccepting means this is an accepting node.
808 // The value is what will be returned to the user as the break status.
809 // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
810
811 if (sd->fAccepting==0) {
812 // State hasn't been marked as accepting yet. Do it now.
813 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
814 if (sd->fAccepting == 0) {
815 sd->fAccepting = ACCEPTING_UNCONDITIONAL;
816 }
817 }
818 if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) {
819 // Both lookahead and non-lookahead accepting for this state.
820 // Favor the look-ahead, because a look-ahead match needs to
821 // immediately stop the run-time engine. First match, not longest.
822 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
823 }
824 // implicit else:
825 // if sd->fAccepting already had a value other than 0 or 1, leave it be.
826 }
827 }
828 }
829 }
830
831
832 //-----------------------------------------------------------------------------
833 //
834 // flagLookAheadStates Very similar to flagAcceptingStates, above.
835 //
836 //-----------------------------------------------------------------------------
flagLookAheadStates()837 void RBBITableBuilder::flagLookAheadStates() {
838 if (U_FAILURE(*fStatus)) {
839 return;
840 }
841 UVector lookAheadNodes(*fStatus);
842 RBBINode *lookAheadNode;
843 int32_t i;
844 int32_t n;
845
846 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
847 if (U_FAILURE(*fStatus)) {
848 return;
849 }
850 for (i=0; i<lookAheadNodes.size(); i++) {
851 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
852 U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
853
854 for (n=0; n<fDStates->size(); n++) {
855 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
856 int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
857 if (positionsIdx >= 0) {
858 U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
859 uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
860 U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
861 // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
862 // printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n",
863 // __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
864 // }
865 sd->fLookAhead = lookaheadSlot;
866 }
867 }
868 }
869 }
870
871
872
873
874 //-----------------------------------------------------------------------------
875 //
876 // flagTaggedStates
877 //
878 //-----------------------------------------------------------------------------
flagTaggedStates()879 void RBBITableBuilder::flagTaggedStates() {
880 if (U_FAILURE(*fStatus)) {
881 return;
882 }
883 UVector tagNodes(*fStatus);
884 RBBINode *tagNode;
885 int32_t i;
886 int32_t n;
887
888 if (U_FAILURE(*fStatus)) {
889 return;
890 }
891 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
892 if (U_FAILURE(*fStatus)) {
893 return;
894 }
895 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
896 tagNode = (RBBINode *)tagNodes.elementAt(i);
897
898 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
899 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
900 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
901 sortedAdd(&sd->fTagVals, tagNode->fVal);
902 }
903 }
904 }
905 }
906
907
908
909
910 //-----------------------------------------------------------------------------
911 //
912 // mergeRuleStatusVals
913 //
914 // Update the global table of rule status {tag} values
915 // The rule builder has a global vector of status values that are common
916 // for all tables. Merge the ones from this table into the global set.
917 //
918 //-----------------------------------------------------------------------------
mergeRuleStatusVals()919 void RBBITableBuilder::mergeRuleStatusVals() {
920 //
921 // The basic outline of what happens here is this...
922 //
923 // for each state in this state table
924 // if the status tag list for this state is in the global statuses list
925 // record where and
926 // continue with the next state
927 // else
928 // add the tag list for this state to the global list.
929 //
930 int i;
931 int n;
932
933 // Pre-set a single tag of {0} into the table.
934 // We will need this as a default, for rule sets with no explicit tagging.
935 if (fRB->fRuleStatusVals->size() == 0) {
936 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
937 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
938 }
939
940 // For each state
941 for (n=0; n<fDStates->size(); n++) {
942 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
943 UVector *thisStatesTagValues = sd->fTagVals;
944 if (thisStatesTagValues == NULL) {
945 // No tag values are explicitly associated with this state.
946 // Set the default tag value.
947 sd->fTagsIdx = 0;
948 continue;
949 }
950
951 // There are tag(s) associated with this state.
952 // fTagsIdx will be the index into the global tag list for this state's tag values.
953 // Initial value of -1 flags that we haven't got it set yet.
954 sd->fTagsIdx = -1;
955 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
956 int32_t nextTagGroupStart = 0;
957
958 // Loop runs once per group of tags in the global list
959 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
960 thisTagGroupStart = nextTagGroupStart;
961 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
962 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
963 // The number of tags for this state is different from
964 // the number of tags in this group from the global list.
965 // Continue with the next group from the global list.
966 continue;
967 }
968 // The lengths match, go ahead and compare the actual tag values
969 // between this state and the group from the global list.
970 for (i=0; i<thisStatesTagValues->size(); i++) {
971 if (thisStatesTagValues->elementAti(i) !=
972 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
973 // Mismatch.
974 break;
975 }
976 }
977
978 if (i == thisStatesTagValues->size()) {
979 // We found a set of tag values in the global list that match
980 // those for this state. Use them.
981 sd->fTagsIdx = thisTagGroupStart;
982 break;
983 }
984 }
985
986 if (sd->fTagsIdx == -1) {
987 // No suitable entry in the global tag list already. Add one
988 sd->fTagsIdx = fRB->fRuleStatusVals->size();
989 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
990 for (i=0; i<thisStatesTagValues->size(); i++) {
991 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
992 }
993 }
994 }
995 }
996
997
998
999
1000
1001
1002
1003 //-----------------------------------------------------------------------------
1004 //
1005 // sortedAdd Add a value to a vector of sorted values (ints).
1006 // Do not replicate entries; if the value is already there, do not
1007 // add a second one.
1008 // Lazily create the vector if it does not already exist.
1009 //
1010 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)1011 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
1012 int32_t i;
1013
1014 if (*vector == NULL) {
1015 *vector = new UVector(*fStatus);
1016 }
1017 if (*vector == NULL || U_FAILURE(*fStatus)) {
1018 return;
1019 }
1020 UVector *vec = *vector;
1021 int32_t vSize = vec->size();
1022 for (i=0; i<vSize; i++) {
1023 int32_t valAtI = vec->elementAti(i);
1024 if (valAtI == val) {
1025 // The value is already in the vector. Don't add it again.
1026 return;
1027 }
1028 if (valAtI > val) {
1029 break;
1030 }
1031 }
1032 vec->insertElementAt(val, i, *fStatus);
1033 }
1034
1035
1036
1037 //-----------------------------------------------------------------------------
1038 //
1039 // setAdd Set operation on UVector
1040 // dest = dest union source
1041 // Elements may only appear once and must be sorted.
1042 //
1043 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)1044 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
1045 int32_t destOriginalSize = dest->size();
1046 int32_t sourceSize = source->size();
1047 int32_t di = 0;
1048 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
1049 void **destPtr, **sourcePtr;
1050 void **destLim, **sourceLim;
1051
1052 if (destOriginalSize > destArray.getCapacity()) {
1053 if (destArray.resize(destOriginalSize) == NULL) {
1054 return;
1055 }
1056 }
1057 destPtr = destArray.getAlias();
1058 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
1059
1060 if (sourceSize > sourceArray.getCapacity()) {
1061 if (sourceArray.resize(sourceSize) == NULL) {
1062 return;
1063 }
1064 }
1065 sourcePtr = sourceArray.getAlias();
1066 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
1067
1068 // Avoid multiple "get element" calls by getting the contents into arrays
1069 (void) dest->toArray(destPtr);
1070 (void) source->toArray(sourcePtr);
1071
1072 dest->setSize(sourceSize+destOriginalSize, *fStatus);
1073
1074 while (sourcePtr < sourceLim && destPtr < destLim) {
1075 if (*destPtr == *sourcePtr) {
1076 dest->setElementAt(*sourcePtr++, di++);
1077 destPtr++;
1078 }
1079 // This check is required for machines with segmented memory, like i5/OS.
1080 // Direct pointer comparison is not recommended.
1081 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1082 dest->setElementAt(*destPtr++, di++);
1083 }
1084 else { /* *sourcePtr < *destPtr */
1085 dest->setElementAt(*sourcePtr++, di++);
1086 }
1087 }
1088
1089 // At most one of these two cleanup loops will execute
1090 while (destPtr < destLim) {
1091 dest->setElementAt(*destPtr++, di++);
1092 }
1093 while (sourcePtr < sourceLim) {
1094 dest->setElementAt(*sourcePtr++, di++);
1095 }
1096
1097 dest->setSize(di, *fStatus);
1098 }
1099
1100
1101
1102 //-----------------------------------------------------------------------------
1103 //
1104 // setEqual Set operation on UVector.
1105 // Compare for equality.
1106 // Elements must be sorted.
1107 //
1108 //-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1109 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1110 return a->equals(*b);
1111 }
1112
1113
1114 //-----------------------------------------------------------------------------
1115 //
1116 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1117 // for each node in the tree.
1118 //
1119 //-----------------------------------------------------------------------------
1120 #ifdef RBBI_DEBUG
printPosSets(RBBINode * n)1121 void RBBITableBuilder::printPosSets(RBBINode *n) {
1122 if (n==NULL) {
1123 return;
1124 }
1125 printf("\n");
1126 RBBINode::printNodeHeader();
1127 RBBINode::printNode(n);
1128 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
1129
1130 RBBIDebugPrintf(" firstpos: ");
1131 printSet(n->fFirstPosSet);
1132
1133 RBBIDebugPrintf(" lastpos: ");
1134 printSet(n->fLastPosSet);
1135
1136 RBBIDebugPrintf(" followpos: ");
1137 printSet(n->fFollowPos);
1138
1139 printPosSets(n->fLeftChild);
1140 printPosSets(n->fRightChild);
1141 }
1142 #endif
1143
1144 //
1145 // findDuplCharClassFrom()
1146 //
findDuplCharClassFrom(IntPair * categories)1147 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1148 int32_t numStates = fDStates->size();
1149 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1150
1151 for (; categories->first < numCols-1; categories->first++) {
1152 // Note: dictionary & non-dictionary columns cannot be merged.
1153 // The limitSecond value prevents considering mixed pairs.
1154 // Dictionary categories are >= DictCategoriesStart.
1155 // Non dict categories are < DictCategoriesStart.
1156 int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
1157 fRB->fSetBuilder->getDictCategoriesStart() : numCols;
1158 for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
1159 // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1160 uint16_t table_base = 0;
1161 uint16_t table_dupl = 1;
1162 for (int32_t state=0; state<numStates; state++) {
1163 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1164 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1165 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1166 if (table_base != table_dupl) {
1167 break;
1168 }
1169 }
1170 if (table_base == table_dupl) {
1171 return true;
1172 }
1173 }
1174 }
1175 return false;
1176 }
1177
1178
1179 //
1180 // removeColumn()
1181 //
removeColumn(int32_t column)1182 void RBBITableBuilder::removeColumn(int32_t column) {
1183 int32_t numStates = fDStates->size();
1184 for (int32_t state=0; state<numStates; state++) {
1185 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1186 U_ASSERT(column < sd->fDtran->size());
1187 sd->fDtran->removeElementAt(column);
1188 }
1189 }
1190
1191 /*
1192 * findDuplicateState
1193 */
findDuplicateState(IntPair * states)1194 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1195 int32_t numStates = fDStates->size();
1196 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1197
1198 for (; states->first<numStates-1; states->first++) {
1199 RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1200 for (states->second=states->first+1; states->second<numStates; states->second++) {
1201 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1202 if (firstSD->fAccepting != duplSD->fAccepting ||
1203 firstSD->fLookAhead != duplSD->fLookAhead ||
1204 firstSD->fTagsIdx != duplSD->fTagsIdx) {
1205 continue;
1206 }
1207 bool rowsMatch = true;
1208 for (int32_t col=0; col < numCols; ++col) {
1209 int32_t firstVal = firstSD->fDtran->elementAti(col);
1210 int32_t duplVal = duplSD->fDtran->elementAti(col);
1211 if (!((firstVal == duplVal) ||
1212 ((firstVal == states->first || firstVal == states->second) &&
1213 (duplVal == states->first || duplVal == states->second)))) {
1214 rowsMatch = false;
1215 break;
1216 }
1217 }
1218 if (rowsMatch) {
1219 return true;
1220 }
1221 }
1222 }
1223 return false;
1224 }
1225
1226
findDuplicateSafeState(IntPair * states)1227 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1228 int32_t numStates = fSafeTable->size();
1229
1230 for (; states->first<numStates-1; states->first++) {
1231 UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1232 for (states->second=states->first+1; states->second<numStates; states->second++) {
1233 UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1234 bool rowsMatch = true;
1235 int32_t numCols = firstRow->length();
1236 for (int32_t col=0; col < numCols; ++col) {
1237 int32_t firstVal = firstRow->charAt(col);
1238 int32_t duplVal = duplRow->charAt(col);
1239 if (!((firstVal == duplVal) ||
1240 ((firstVal == states->first || firstVal == states->second) &&
1241 (duplVal == states->first || duplVal == states->second)))) {
1242 rowsMatch = false;
1243 break;
1244 }
1245 }
1246 if (rowsMatch) {
1247 return true;
1248 }
1249 }
1250 }
1251 return false;
1252 }
1253
1254
removeState(IntPair duplStates)1255 void RBBITableBuilder::removeState(IntPair duplStates) {
1256 const int32_t keepState = duplStates.first;
1257 const int32_t duplState = duplStates.second;
1258 U_ASSERT(keepState < duplState);
1259 U_ASSERT(duplState < fDStates->size());
1260
1261 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1262 fDStates->removeElementAt(duplState);
1263 delete duplSD;
1264
1265 int32_t numStates = fDStates->size();
1266 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1267 for (int32_t state=0; state<numStates; ++state) {
1268 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1269 for (int32_t col=0; col<numCols; col++) {
1270 int32_t existingVal = sd->fDtran->elementAti(col);
1271 int32_t newVal = existingVal;
1272 if (existingVal == duplState) {
1273 newVal = keepState;
1274 } else if (existingVal > duplState) {
1275 newVal = existingVal - 1;
1276 }
1277 sd->fDtran->setElementAt(newVal, col);
1278 }
1279 }
1280 }
1281
removeSafeState(IntPair duplStates)1282 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1283 const int32_t keepState = duplStates.first;
1284 const int32_t duplState = duplStates.second;
1285 U_ASSERT(keepState < duplState);
1286 U_ASSERT(duplState < fSafeTable->size());
1287
1288 fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function
1289 // and will auto-delete the removed element.
1290 int32_t numStates = fSafeTable->size();
1291 for (int32_t state=0; state<numStates; ++state) {
1292 UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1293 int32_t numCols = sd->length();
1294 for (int32_t col=0; col<numCols; col++) {
1295 int32_t existingVal = sd->charAt(col);
1296 int32_t newVal = existingVal;
1297 if (existingVal == duplState) {
1298 newVal = keepState;
1299 } else if (existingVal > duplState) {
1300 newVal = existingVal - 1;
1301 }
1302 sd->setCharAt(col, static_cast<char16_t>(newVal));
1303 }
1304 }
1305 }
1306
1307
1308 /*
1309 * RemoveDuplicateStates
1310 */
removeDuplicateStates()1311 int32_t RBBITableBuilder::removeDuplicateStates() {
1312 IntPair dupls = {3, 0};
1313 int32_t numStatesRemoved = 0;
1314
1315 while (findDuplicateState(&dupls)) {
1316 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1317 removeState(dupls);
1318 ++numStatesRemoved;
1319 }
1320 return numStatesRemoved;
1321 }
1322
1323
1324 //-----------------------------------------------------------------------------
1325 //
1326 // getTableSize() Calculate the size of the runtime form of this
1327 // state transition table.
1328 //
1329 //-----------------------------------------------------------------------------
getTableSize() const1330 int32_t RBBITableBuilder::getTableSize() const {
1331 int32_t size = 0;
1332 int32_t numRows;
1333 int32_t numCols;
1334 int32_t rowSize;
1335
1336 if (fTree == NULL) {
1337 return 0;
1338 }
1339
1340 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1341
1342 numRows = fDStates->size();
1343 numCols = fRB->fSetBuilder->getNumCharCategories();
1344
1345 if (use8BitsForTable()) {
1346 rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1347 } else {
1348 rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1349 }
1350 size += numRows * rowSize;
1351 return size;
1352 }
1353
use8BitsForTable() const1354 bool RBBITableBuilder::use8BitsForTable() const {
1355 return fDStates->size() <= kMaxStateFor8BitsTable;
1356 }
1357
1358 //-----------------------------------------------------------------------------
1359 //
1360 // exportTable() export the state transition table in the format required
1361 // by the runtime engine. getTableSize() bytes of memory
1362 // must be available at the output address "where".
1363 //
1364 //-----------------------------------------------------------------------------
exportTable(void * where)1365 void RBBITableBuilder::exportTable(void *where) {
1366 RBBIStateTable *table = (RBBIStateTable *)where;
1367 uint32_t state;
1368 int col;
1369
1370 if (U_FAILURE(*fStatus) || fTree == NULL) {
1371 return;
1372 }
1373
1374 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1375 if (catCount > 0x7fff ||
1376 fDStates->size() > 0x7fff) {
1377 *fStatus = U_BRK_INTERNAL_ERROR;
1378 return;
1379 }
1380
1381 table->fNumStates = fDStates->size();
1382 table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
1383 table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
1384 table->fFlags = 0;
1385 if (use8BitsForTable()) {
1386 table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1387 table->fFlags |= RBBI_8BITS_ROWS;
1388 } else {
1389 table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1390 }
1391 if (fRB->fLookAheadHardBreak) {
1392 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1393 }
1394 if (fRB->fSetBuilder->sawBOF()) {
1395 table->fFlags |= RBBI_BOF_REQUIRED;
1396 }
1397
1398 for (state=0; state<table->fNumStates; state++) {
1399 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1400 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1401 if (use8BitsForTable()) {
1402 U_ASSERT (sd->fAccepting <= 255);
1403 U_ASSERT (sd->fLookAhead <= 255);
1404 U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255);
1405 RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1406 r8->fAccepting = sd->fAccepting;
1407 r8->fLookAhead = sd->fLookAhead;
1408 r8->fTagsIdx = sd->fTagsIdx;
1409 for (col=0; col<catCount; col++) {
1410 U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
1411 r8->fNextState[col] = sd->fDtran->elementAti(col);
1412 }
1413 } else {
1414 U_ASSERT (sd->fAccepting <= 0xffff);
1415 U_ASSERT (sd->fLookAhead <= 0xffff);
1416 U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff);
1417 row->r16.fAccepting = sd->fAccepting;
1418 row->r16.fLookAhead = sd->fLookAhead;
1419 row->r16.fTagsIdx = sd->fTagsIdx;
1420 for (col=0; col<catCount; col++) {
1421 row->r16.fNextState[col] = sd->fDtran->elementAti(col);
1422 }
1423 }
1424 }
1425 }
1426
1427
1428 /**
1429 * Synthesize a safe state table from the main state table.
1430 */
buildSafeReverseTable(UErrorCode & status)1431 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1432 // The safe table creation has three steps:
1433
1434 // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
1435 // following the pair do not depend on context or state before the pair. To test
1436 // whether a pair is safe, run it through the main forward state table, starting
1437 // from each state. If the the final state is the same, no matter what the starting state,
1438 // the pair is safe.
1439 //
1440 // 2. Build a state table that recognizes the safe pairs. It's similar to their
1441 // forward table, with a column for each input character [class], and a row for
1442 // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1443 // create an additional state for each input character category; being in
1444 // one of these states means that the character has been seen, and is potentially
1445 // the first of a pair. In each of these rows, the entry for the second character
1446 // of a safe pair is set to the stop state (0), indicating that a match was found.
1447 // All other table entries are set to the state corresponding the current input
1448 // character, allowing that charcter to be the of a start following pair.
1449 //
1450 // Because the safe rules are to be run in reverse, moving backwards in the text,
1451 // the first and second pair categories are swapped when building the table.
1452 //
1453 // 3. Compress the table. There are typically many rows (states) that are
1454 // equivalent - that have zeroes (match completed) in the same columns -
1455 // and can be folded together.
1456
1457 // Each safe pair is stored as two UChars in the safePair string.
1458 UnicodeString safePairs;
1459
1460 int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1461 int32_t numStates = fDStates->size();
1462
1463 for (int32_t c1=0; c1<numCharClasses; ++c1) {
1464 for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1465 int32_t wantedEndState = -1;
1466 int32_t endState = 0;
1467 for (int32_t startState = 1; startState < numStates; ++startState) {
1468 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1469 int32_t s2 = startStateD->fDtran->elementAti(c1);
1470 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1471 endState = s2StateD->fDtran->elementAti(c2);
1472 if (wantedEndState < 0) {
1473 wantedEndState = endState;
1474 } else {
1475 if (wantedEndState != endState) {
1476 break;
1477 }
1478 }
1479 }
1480 if (wantedEndState == endState) {
1481 safePairs.append((char16_t)c1);
1482 safePairs.append((char16_t)c2);
1483 // printf("(%d, %d) ", c1, c2);
1484 }
1485 }
1486 // printf("\n");
1487 }
1488
1489 // Populate the initial safe table.
1490 // The table as a whole is UVector<UnicodeString>
1491 // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1492 // Row 0 is the stop state.
1493 // Row 1 is the start sate.
1494 // Row 2 and beyond are other states, initially one per char class, but
1495 // after initial construction, many of the states will be combined, compacting the table.
1496 // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1497 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1498
1499 U_ASSERT(fSafeTable == nullptr);
1500 fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
1501 for (int32_t row=0; row<numCharClasses + 2; ++row) {
1502 fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1503 }
1504
1505 // From the start state, each input char class transitions to the state for that input.
1506 UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1507 for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1508 // Note: +2 for the start & stop state.
1509 startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1510 }
1511
1512 // Initially make every other state table row look like the start state row,
1513 for (int32_t row=2; row<numCharClasses+2; ++row) {
1514 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1515 rowState = startState; // UnicodeString assignment, copies contents.
1516 }
1517
1518 // Run through the safe pairs, set the next state to zero when pair has been seen.
1519 // Zero being the stop state, meaning we found a safe point.
1520 for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1521 int32_t c1 = safePairs.charAt(pairIdx);
1522 int32_t c2 = safePairs.charAt(pairIdx + 1);
1523
1524 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1525 rowState.setCharAt(c1, 0);
1526 }
1527
1528 // Remove duplicate or redundant rows from the table.
1529 IntPair states = {1, 0};
1530 while (findDuplicateSafeState(&states)) {
1531 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1532 removeSafeState(states);
1533 }
1534 }
1535
1536
1537 //-----------------------------------------------------------------------------
1538 //
1539 // getSafeTableSize() Calculate the size of the runtime form of this
1540 // safe state table.
1541 //
1542 //-----------------------------------------------------------------------------
getSafeTableSize() const1543 int32_t RBBITableBuilder::getSafeTableSize() const {
1544 int32_t size = 0;
1545 int32_t numRows;
1546 int32_t numCols;
1547 int32_t rowSize;
1548
1549 if (fSafeTable == nullptr) {
1550 return 0;
1551 }
1552
1553 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1554
1555 numRows = fSafeTable->size();
1556 numCols = fRB->fSetBuilder->getNumCharCategories();
1557
1558 if (use8BitsForSafeTable()) {
1559 rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1560 } else {
1561 rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1562 }
1563 size += numRows * rowSize;
1564 return size;
1565 }
1566
use8BitsForSafeTable() const1567 bool RBBITableBuilder::use8BitsForSafeTable() const {
1568 return fSafeTable->size() <= kMaxStateFor8BitsTable;
1569 }
1570
1571 //-----------------------------------------------------------------------------
1572 //
1573 // exportSafeTable() export the state transition table in the format required
1574 // by the runtime engine. getTableSize() bytes of memory
1575 // must be available at the output address "where".
1576 //
1577 //-----------------------------------------------------------------------------
exportSafeTable(void * where)1578 void RBBITableBuilder::exportSafeTable(void *where) {
1579 RBBIStateTable *table = (RBBIStateTable *)where;
1580 uint32_t state;
1581 int col;
1582
1583 if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1584 return;
1585 }
1586
1587 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1588 if (catCount > 0x7fff ||
1589 fSafeTable->size() > 0x7fff) {
1590 *fStatus = U_BRK_INTERNAL_ERROR;
1591 return;
1592 }
1593
1594 table->fNumStates = fSafeTable->size();
1595 table->fFlags = 0;
1596 if (use8BitsForSafeTable()) {
1597 table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1598 table->fFlags |= RBBI_8BITS_ROWS;
1599 } else {
1600 table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1601 }
1602
1603 for (state=0; state<table->fNumStates; state++) {
1604 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1605 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1606 if (use8BitsForSafeTable()) {
1607 RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1608 r8->fAccepting = 0;
1609 r8->fLookAhead = 0;
1610 r8->fTagsIdx = 0;
1611 for (col=0; col<catCount; col++) {
1612 U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
1613 r8->fNextState[col] = static_cast<uint8_t>(rowString->charAt(col));
1614 }
1615 } else {
1616 row->r16.fAccepting = 0;
1617 row->r16.fLookAhead = 0;
1618 row->r16.fTagsIdx = 0;
1619 for (col=0; col<catCount; col++) {
1620 row->r16.fNextState[col] = rowString->charAt(col);
1621 }
1622 }
1623 }
1624 }
1625
1626
1627
1628
1629 //-----------------------------------------------------------------------------
1630 //
1631 // printSet Debug function. Print the contents of a UVector
1632 //
1633 //-----------------------------------------------------------------------------
1634 #ifdef RBBI_DEBUG
printSet(UVector * s)1635 void RBBITableBuilder::printSet(UVector *s) {
1636 int32_t i;
1637 for (i=0; i<s->size(); i++) {
1638 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1639 RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
1640 }
1641 RBBIDebugPrintf("\n");
1642 }
1643 #endif
1644
1645
1646 //-----------------------------------------------------------------------------
1647 //
1648 // printStates Debug Function. Dump the fully constructed state transition table.
1649 //
1650 //-----------------------------------------------------------------------------
1651 #ifdef RBBI_DEBUG
printStates()1652 void RBBITableBuilder::printStates() {
1653 int c; // input "character"
1654 int n; // state number
1655
1656 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1657 RBBIDebugPrintf(" | Acc LA Tag");
1658 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1659 RBBIDebugPrintf(" %3d", c);
1660 }
1661 RBBIDebugPrintf("\n");
1662 RBBIDebugPrintf(" |---------------");
1663 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1664 RBBIDebugPrintf("----");
1665 }
1666 RBBIDebugPrintf("\n");
1667
1668 for (n=0; n<fDStates->size(); n++) {
1669 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1670 RBBIDebugPrintf(" %3d | " , n);
1671 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1672 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1673 RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
1674 }
1675 RBBIDebugPrintf("\n");
1676 }
1677 RBBIDebugPrintf("\n\n");
1678 }
1679 #endif
1680
1681
1682 //-----------------------------------------------------------------------------
1683 //
1684 // printSafeTable Debug Function. Dump the fully constructed safe table.
1685 //
1686 //-----------------------------------------------------------------------------
1687 #ifdef RBBI_DEBUG
printReverseTable()1688 void RBBITableBuilder::printReverseTable() {
1689 int c; // input "character"
1690 int n; // state number
1691
1692 RBBIDebugPrintf(" Safe Reverse Table \n");
1693 if (fSafeTable == nullptr) {
1694 RBBIDebugPrintf(" --- nullptr ---\n");
1695 return;
1696 }
1697 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1698 RBBIDebugPrintf(" | Acc LA Tag");
1699 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1700 RBBIDebugPrintf(" %2d", c);
1701 }
1702 RBBIDebugPrintf("\n");
1703 RBBIDebugPrintf(" |---------------");
1704 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1705 RBBIDebugPrintf("---");
1706 }
1707 RBBIDebugPrintf("\n");
1708
1709 for (n=0; n<fSafeTable->size(); n++) {
1710 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1711 RBBIDebugPrintf(" %3d | " , n);
1712 RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
1713 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1714 RBBIDebugPrintf(" %2d", rowString->charAt(c));
1715 }
1716 RBBIDebugPrintf("\n");
1717 }
1718 RBBIDebugPrintf("\n\n");
1719 }
1720 #endif
1721
1722
1723
1724 //-----------------------------------------------------------------------------
1725 //
1726 // printRuleStatusTable Debug Function. Dump the common rule status table
1727 //
1728 //-----------------------------------------------------------------------------
1729 #ifdef RBBI_DEBUG
printRuleStatusTable()1730 void RBBITableBuilder::printRuleStatusTable() {
1731 int32_t thisRecord = 0;
1732 int32_t nextRecord = 0;
1733 int i;
1734 UVector *tbl = fRB->fRuleStatusVals;
1735
1736 RBBIDebugPrintf("index | tags \n");
1737 RBBIDebugPrintf("-------------------\n");
1738
1739 while (nextRecord < tbl->size()) {
1740 thisRecord = nextRecord;
1741 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1742 RBBIDebugPrintf("%4d ", thisRecord);
1743 for (i=thisRecord+1; i<nextRecord; i++) {
1744 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1745 }
1746 RBBIDebugPrintf("\n");
1747 }
1748 RBBIDebugPrintf("\n\n");
1749 }
1750 #endif
1751
1752
1753 //-----------------------------------------------------------------------------
1754 //
1755 // RBBIStateDescriptor Methods. This is a very struct-like class
1756 // Most access is directly to the fields.
1757 //
1758 //-----------------------------------------------------------------------------
1759
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1760 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1761 fMarked = FALSE;
1762 fAccepting = 0;
1763 fLookAhead = 0;
1764 fTagsIdx = 0;
1765 fTagVals = NULL;
1766 fPositions = NULL;
1767 fDtran = NULL;
1768
1769 fDtran = new UVector32(lastInputSymbol+1, *fStatus);
1770 if (U_FAILURE(*fStatus)) {
1771 return;
1772 }
1773 if (fDtran == NULL) {
1774 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1775 return;
1776 }
1777 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
1778 // It is indexed by input symbols, and will
1779 // hold the next state number for each
1780 // symbol.
1781 }
1782
1783
~RBBIStateDescriptor()1784 RBBIStateDescriptor::~RBBIStateDescriptor() {
1785 delete fPositions;
1786 delete fDtran;
1787 delete fTagVals;
1788 fPositions = NULL;
1789 fDtran = NULL;
1790 fTagVals = NULL;
1791 }
1792
1793 U_NAMESPACE_END
1794
1795 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1796