1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 // rbbitblb.cpp
11 //
12
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbiscan.h"
22 #include "rbbisetb.h"
23 #include "rbbidata.h"
24 #include "cstring.h"
25 #include "uassert.h"
26 #include "uvectr32.h"
27 #include "cmemory.h"
28
29 U_NAMESPACE_BEGIN
30
31 const int32_t kMaxStateFor8BitsTable = 255;
32
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode,UErrorCode & status)33 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
34 fRB(rb),
35 fTree(*rootNode),
36 fStatus(&status),
37 fDStates(nullptr),
38 fSafeTable(nullptr) {
39 if (U_FAILURE(status)) {
40 return;
41 }
42 // fDStates is UVector<RBBIStateDescriptor *>
43 fDStates = new UVector(status);
44 if (U_SUCCESS(status) && fDStates == nullptr ) {
45 status = U_MEMORY_ALLOCATION_ERROR;
46 }
47 }
48
49
50
~RBBITableBuilder()51 RBBITableBuilder::~RBBITableBuilder() {
52 int i;
53 for (i=0; i<fDStates->size(); i++) {
54 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
55 }
56 delete fDStates;
57 delete fSafeTable;
58 delete fLookAheadRuleMap;
59 }
60
61
62 //-----------------------------------------------------------------------------
63 //
64 // RBBITableBuilder::buildForwardTable - This is the main function for building
65 // the DFA state transition table from the RBBI rules parse tree.
66 //
67 //-----------------------------------------------------------------------------
buildForwardTable()68 void RBBITableBuilder::buildForwardTable() {
69
70 if (U_FAILURE(*fStatus)) {
71 return;
72 }
73
74 // If there were no rules, just return. This situation can easily arise
75 // for the reverse rules.
76 if (fTree==NULL) {
77 return;
78 }
79
80 //
81 // Walk through the tree, replacing any references to $variables with a copy of the
82 // parse tree for the substitution expression.
83 //
84 fTree = fTree->flattenVariables();
85 #ifdef RBBI_DEBUG
86 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
87 RBBIDebugPuts("\nParse tree after flattening variable references.");
88 RBBINode::printTree(fTree, TRUE);
89 }
90 #endif
91
92 //
93 // If the rules contained any references to {bof}
94 // add a {bof} <cat> <former root of tree> to the
95 // tree. Means that all matches must start out with the
96 // {bof} fake character.
97 //
98 if (fRB->fSetBuilder->sawBOF()) {
99 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
100 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
101 // Delete and exit if memory allocation failed.
102 if (bofTop == NULL || bofLeaf == NULL) {
103 *fStatus = U_MEMORY_ALLOCATION_ERROR;
104 delete bofTop;
105 delete bofLeaf;
106 return;
107 }
108 bofTop->fLeftChild = bofLeaf;
109 bofTop->fRightChild = fTree;
110 bofLeaf->fParent = bofTop;
111 bofLeaf->fVal = 2; // Reserved value for {bof}.
112 fTree = bofTop;
113 }
114
115 //
116 // Add a unique right-end marker to the expression.
117 // Appears as a cat-node, left child being the original tree,
118 // right child being the end marker.
119 //
120 RBBINode *cn = new RBBINode(RBBINode::opCat);
121 // Exit if memory allocation failed.
122 if (cn == NULL) {
123 *fStatus = U_MEMORY_ALLOCATION_ERROR;
124 return;
125 }
126 cn->fLeftChild = fTree;
127 fTree->fParent = cn;
128 RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
129 // Delete and exit if memory allocation failed.
130 if (cn->fRightChild == NULL) {
131 *fStatus = U_MEMORY_ALLOCATION_ERROR;
132 delete cn;
133 return;
134 }
135 cn->fRightChild->fParent = cn;
136 fTree = cn;
137
138 //
139 // Replace all references to UnicodeSets with the tree for the equivalent
140 // expression.
141 //
142 fTree->flattenSets();
143 #ifdef RBBI_DEBUG
144 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
145 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
146 RBBINode::printTree(fTree, TRUE);
147 }
148 #endif
149
150
151 //
152 // calculate the functions nullable, firstpos, lastpos and followpos on
153 // nodes in the parse tree.
154 // See the algorithm description in Aho.
155 // Understanding how this works by looking at the code alone will be
156 // nearly impossible.
157 //
158 calcNullable(fTree);
159 calcFirstPos(fTree);
160 calcLastPos(fTree);
161 calcFollowPos(fTree);
162 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
163 RBBIDebugPuts("\n");
164 printPosSets(fTree);
165 }
166
167 //
168 // For "chained" rules, modify the followPos sets
169 //
170 if (fRB->fChainRules) {
171 calcChainedFollowPos(fTree, endMarkerNode);
172 }
173
174 //
175 // BOF (start of input) test fixup.
176 //
177 if (fRB->fSetBuilder->sawBOF()) {
178 bofFixup();
179 }
180
181 //
182 // Build the DFA state transition tables.
183 //
184 buildStateTable();
185 mapLookAheadRules();
186 flagAcceptingStates();
187 flagLookAheadStates();
188 flagTaggedStates();
189
190 //
191 // Update the global table of rule status {tag} values
192 // The rule builder has a global vector of status values that are common
193 // for all tables. Merge the ones from this table into the global set.
194 //
195 mergeRuleStatusVals();
196 }
197
198
199
200 //-----------------------------------------------------------------------------
201 //
202 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
203 //
204 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)205 void RBBITableBuilder::calcNullable(RBBINode *n) {
206 if (n == NULL) {
207 return;
208 }
209 if (n->fType == RBBINode::setRef ||
210 n->fType == RBBINode::endMark ) {
211 // These are non-empty leaf node types.
212 n->fNullable = FALSE;
213 return;
214 }
215
216 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
217 // Lookahead marker node. It's a leaf, so no recursion on children.
218 // It's nullable because it does not match any literal text from the input stream.
219 n->fNullable = TRUE;
220 return;
221 }
222
223
224 // The node is not a leaf.
225 // Calculate nullable on its children.
226 calcNullable(n->fLeftChild);
227 calcNullable(n->fRightChild);
228
229 // Apply functions from table 3.40 in Aho
230 if (n->fType == RBBINode::opOr) {
231 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
232 }
233 else if (n->fType == RBBINode::opCat) {
234 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
235 }
236 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
237 n->fNullable = TRUE;
238 }
239 else {
240 n->fNullable = FALSE;
241 }
242 }
243
244
245
246
247 //-----------------------------------------------------------------------------
248 //
249 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
250 //
251 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)252 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
253 if (n == NULL) {
254 return;
255 }
256 if (n->fType == RBBINode::leafChar ||
257 n->fType == RBBINode::endMark ||
258 n->fType == RBBINode::lookAhead ||
259 n->fType == RBBINode::tag) {
260 // These are non-empty leaf node types.
261 // Note: In order to maintain the sort invariant on the set,
262 // this function should only be called on a node whose set is
263 // empty to start with.
264 n->fFirstPosSet->addElement(n, *fStatus);
265 return;
266 }
267
268 // The node is not a leaf.
269 // Calculate firstPos on its children.
270 calcFirstPos(n->fLeftChild);
271 calcFirstPos(n->fRightChild);
272
273 // Apply functions from table 3.40 in Aho
274 if (n->fType == RBBINode::opOr) {
275 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277 }
278 else if (n->fType == RBBINode::opCat) {
279 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
280 if (n->fLeftChild->fNullable) {
281 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
282 }
283 }
284 else if (n->fType == RBBINode::opStar ||
285 n->fType == RBBINode::opQuestion ||
286 n->fType == RBBINode::opPlus) {
287 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
288 }
289 }
290
291
292
293 //-----------------------------------------------------------------------------
294 //
295 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
296 //
297 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)298 void RBBITableBuilder::calcLastPos(RBBINode *n) {
299 if (n == NULL) {
300 return;
301 }
302 if (n->fType == RBBINode::leafChar ||
303 n->fType == RBBINode::endMark ||
304 n->fType == RBBINode::lookAhead ||
305 n->fType == RBBINode::tag) {
306 // These are non-empty leaf node types.
307 // Note: In order to maintain the sort invariant on the set,
308 // this function should only be called on a node whose set is
309 // empty to start with.
310 n->fLastPosSet->addElement(n, *fStatus);
311 return;
312 }
313
314 // The node is not a leaf.
315 // Calculate lastPos on its children.
316 calcLastPos(n->fLeftChild);
317 calcLastPos(n->fRightChild);
318
319 // Apply functions from table 3.40 in Aho
320 if (n->fType == RBBINode::opOr) {
321 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
322 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
323 }
324 else if (n->fType == RBBINode::opCat) {
325 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
326 if (n->fRightChild->fNullable) {
327 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
328 }
329 }
330 else if (n->fType == RBBINode::opStar ||
331 n->fType == RBBINode::opQuestion ||
332 n->fType == RBBINode::opPlus) {
333 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
334 }
335 }
336
337
338
339 //-----------------------------------------------------------------------------
340 //
341 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
342 //
343 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)344 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
345 if (n == NULL ||
346 n->fType == RBBINode::leafChar ||
347 n->fType == RBBINode::endMark) {
348 return;
349 }
350
351 calcFollowPos(n->fLeftChild);
352 calcFollowPos(n->fRightChild);
353
354 // Aho rule #1
355 if (n->fType == RBBINode::opCat) {
356 RBBINode *i; // is 'i' in Aho's description
357 uint32_t ix;
358
359 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
360
361 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
362 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
363 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
364 }
365 }
366
367 // Aho rule #2
368 if (n->fType == RBBINode::opStar ||
369 n->fType == RBBINode::opPlus) {
370 RBBINode *i; // again, n and i are the names from Aho's description.
371 uint32_t ix;
372
373 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
374 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
375 setAdd(i->fFollowPos, n->fFirstPosSet);
376 }
377 }
378
379
380
381 }
382
383 //-----------------------------------------------------------------------------
384 //
385 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
386 // as roots of a rule to a destination vector.
387 //
388 //-----------------------------------------------------------------------------
addRuleRootNodes(UVector * dest,RBBINode * node)389 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
390 if (node == NULL || U_FAILURE(*fStatus)) {
391 return;
392 }
393 U_ASSERT(!dest->hasDeleter());
394 if (node->fRuleRoot) {
395 dest->addElement(node, *fStatus);
396 // Note: rules cannot nest. If we found a rule start node,
397 // no child node can also be a start node.
398 return;
399 }
400 addRuleRootNodes(dest, node->fLeftChild);
401 addRuleRootNodes(dest, node->fRightChild);
402 }
403
404 //-----------------------------------------------------------------------------
405 //
406 // calcChainedFollowPos. Modify the previously calculated followPos sets
407 // to implement rule chaining. NOT described by Aho
408 //
409 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree,RBBINode * endMarkNode)410 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
411
412 UVector leafNodes(*fStatus);
413 if (U_FAILURE(*fStatus)) {
414 return;
415 }
416
417 // get a list all leaf nodes
418 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
419 if (U_FAILURE(*fStatus)) {
420 return;
421 }
422
423 // Collect all leaf nodes that can start matches for rules
424 // with inbound chaining enabled, which is the union of the
425 // firstPosition sets from each of the rule root nodes.
426
427 UVector ruleRootNodes(*fStatus);
428 addRuleRootNodes(&ruleRootNodes, tree);
429
430 UVector matchStartNodes(*fStatus);
431 for (int j=0; j<ruleRootNodes.size(); ++j) {
432 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
433 if (node->fChainIn) {
434 setAdd(&matchStartNodes, node->fFirstPosSet);
435 }
436 }
437 if (U_FAILURE(*fStatus)) {
438 return;
439 }
440
441 int32_t endNodeIx;
442 int32_t startNodeIx;
443
444 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
445 RBBINode *endNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
446
447 // Identify leaf nodes that correspond to overall rule match positions.
448 // These include the endMarkNode in their followPos sets.
449 //
450 // Note: do not consider other end marker nodes, those that are added to
451 // look-ahead rules. These can't chain; a match immediately stops
452 // further matching. This leaves exactly one end marker node, the one
453 // at the end of the complete tree.
454
455 if (!endNode->fFollowPos->contains(endMarkNode)) {
456 continue;
457 }
458
459 // We've got a node that can end a match.
460
461 // !!LBCMNoChain implementation: If this node's val correspond to
462 // the Line Break $CM char class, don't chain from it.
463 // TODO: Remove this. !!LBCMNoChain is deprecated, and is not used
464 // by any of the standard ICU rules.
465 if (fRB->fLBCMNoChain) {
466 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
467 if (c != -1) {
468 // c == -1 occurs with sets containing only the {eof} marker string.
469 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
470 if (cLBProp == U_LB_COMBINING_MARK) {
471 continue;
472 }
473 }
474 }
475
476 // Now iterate over the nodes that can start a match, looking for ones
477 // with the same char class as our ending node.
478 RBBINode *startNode;
479 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
480 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
481 if (startNode->fType != RBBINode::leafChar) {
482 continue;
483 }
484
485 if (endNode->fVal == startNode->fVal) {
486 // The end val (character class) of one possible match is the
487 // same as the start of another.
488
489 // Add all nodes from the followPos of the start node to the
490 // followPos set of the end node, which will have the effect of
491 // letting matches transition from a match state at endNode
492 // to the second char of a match starting with startNode.
493 setAdd(endNode->fFollowPos, startNode->fFollowPos);
494 }
495 }
496 }
497 }
498
499
500 //-----------------------------------------------------------------------------
501 //
502 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
//   Do a swizzle similar to chaining, modifying the followPos set of
//   the bofNode to include the followPos nodes from other {bof} nodes
505 // scattered through the tree.
506 //
507 // This function has much in common with calcChainedFollowPos().
508 //
509 //-----------------------------------------------------------------------------
bofFixup()510 void RBBITableBuilder::bofFixup() {
511
512 if (U_FAILURE(*fStatus)) {
513 return;
514 }
515
516 // The parse tree looks like this ...
517 // fTree root ---> <cat>
518 // / \ .
519 // <cat> <#end node>
520 // / \ .
521 // <bofNode> rest
522 // of tree
523 //
524 // We will be adding things to the followPos set of the <bofNode>
525 //
526 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
527 U_ASSERT(bofNode->fType == RBBINode::leafChar);
528 U_ASSERT(bofNode->fVal == 2);
529
530 // Get all nodes that can be the start a match of the user-written rules
531 // (excluding the fake bofNode)
532 // We want the nodes that can start a match in the
533 // part labeled "rest of tree"
534 //
535 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
536
537 RBBINode *startNode;
538 int startNodeIx;
539 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
540 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
541 if (startNode->fType != RBBINode::leafChar) {
542 continue;
543 }
544
545 if (startNode->fVal == bofNode->fVal) {
546 // We found a leaf node corresponding to a {bof} that was
547 // explicitly written into a rule.
548 // Add everything from the followPos set of this node to the
549 // followPos set of the fake bofNode at the start of the tree.
550 //
551 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
552 }
553 }
554 }
555
556 //-----------------------------------------------------------------------------
557 //
558 // buildStateTable() Determine the set of runtime DFA states and the
559 // transition tables for these states, by the algorithm
560 // of fig. 3.44 in Aho.
561 //
//   Most of the comments are quotes of Aho's pseudo-code.
563 //
564 //-----------------------------------------------------------------------------
buildStateTable()565 void RBBITableBuilder::buildStateTable() {
566 if (U_FAILURE(*fStatus)) {
567 return;
568 }
569 RBBIStateDescriptor *failState;
570 // Set it to NULL to avoid uninitialized warning
571 RBBIStateDescriptor *initialState = NULL;
572 //
573 // Add a dummy state 0 - the stop state. Not from Aho.
574 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
575 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
576 if (failState == NULL) {
577 *fStatus = U_MEMORY_ALLOCATION_ERROR;
578 goto ExitBuildSTdeleteall;
579 }
580 failState->fPositions = new UVector(*fStatus);
581 if (failState->fPositions == NULL) {
582 *fStatus = U_MEMORY_ALLOCATION_ERROR;
583 }
584 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
585 goto ExitBuildSTdeleteall;
586 }
587 fDStates->addElement(failState, *fStatus);
588 if (U_FAILURE(*fStatus)) {
589 goto ExitBuildSTdeleteall;
590 }
591
592 // initially, the only unmarked state in Dstates is firstpos(root),
593 // where toot is the root of the syntax tree for (r)#;
594 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
595 if (initialState == NULL) {
596 *fStatus = U_MEMORY_ALLOCATION_ERROR;
597 }
598 if (U_FAILURE(*fStatus)) {
599 goto ExitBuildSTdeleteall;
600 }
601 initialState->fPositions = new UVector(*fStatus);
602 if (initialState->fPositions == NULL) {
603 *fStatus = U_MEMORY_ALLOCATION_ERROR;
604 }
605 if (U_FAILURE(*fStatus)) {
606 goto ExitBuildSTdeleteall;
607 }
608 setAdd(initialState->fPositions, fTree->fFirstPosSet);
609 fDStates->addElement(initialState, *fStatus);
610 if (U_FAILURE(*fStatus)) {
611 goto ExitBuildSTdeleteall;
612 }
613
614 // while there is an unmarked state T in Dstates do begin
615 for (;;) {
616 RBBIStateDescriptor *T = NULL;
617 int32_t tx;
618 for (tx=1; tx<fDStates->size(); tx++) {
619 RBBIStateDescriptor *temp;
620 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
621 if (temp->fMarked == FALSE) {
622 T = temp;
623 break;
624 }
625 }
626 if (T == NULL) {
627 break;
628 }
629
630 // mark T;
631 T->fMarked = TRUE;
632
633 // for each input symbol a do begin
634 int32_t a;
635 for (a = 1; a<=lastInputSymbol; a++) {
636 // let U be the set of positions that are in followpos(p)
637 // for some position p in T
638 // such that the symbol at position p is a;
639 UVector *U = NULL;
640 RBBINode *p;
641 int32_t px;
642 for (px=0; px<T->fPositions->size(); px++) {
643 p = (RBBINode *)T->fPositions->elementAt(px);
644 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
645 if (U == NULL) {
646 U = new UVector(*fStatus);
647 if (U == NULL) {
648 *fStatus = U_MEMORY_ALLOCATION_ERROR;
649 goto ExitBuildSTdeleteall;
650 }
651 }
652 setAdd(U, p->fFollowPos);
653 }
654 }
655
656 // if U is not empty and not in DStates then
657 int32_t ux = 0;
658 UBool UinDstates = FALSE;
659 if (U != NULL) {
660 U_ASSERT(U->size() > 0);
661 int ix;
662 for (ix=0; ix<fDStates->size(); ix++) {
663 RBBIStateDescriptor *temp2;
664 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
665 if (setEquals(U, temp2->fPositions)) {
666 delete U;
667 U = temp2->fPositions;
668 ux = ix;
669 UinDstates = TRUE;
670 break;
671 }
672 }
673
674 // Add U as an unmarked state to Dstates
675 if (!UinDstates)
676 {
677 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
678 if (newState == NULL) {
679 *fStatus = U_MEMORY_ALLOCATION_ERROR;
680 }
681 if (U_FAILURE(*fStatus)) {
682 goto ExitBuildSTdeleteall;
683 }
684 newState->fPositions = U;
685 fDStates->addElement(newState, *fStatus);
686 if (U_FAILURE(*fStatus)) {
687 return;
688 }
689 ux = fDStates->size()-1;
690 }
691
692 // Dtran[T, a] := U;
693 T->fDtran->setElementAt(ux, a);
694 }
695 }
696 }
697 return;
698 // delete local pointers only if error occurred.
699 ExitBuildSTdeleteall:
700 delete initialState;
701 delete failState;
702 }
703
704
705 /**
706 * mapLookAheadRules
707 *
708 */
mapLookAheadRules()709 void RBBITableBuilder::mapLookAheadRules() {
710 fLookAheadRuleMap = new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
711 if (fLookAheadRuleMap == nullptr) {
712 *fStatus = U_MEMORY_ALLOCATION_ERROR;
713 }
714 if (U_FAILURE(*fStatus)) {
715 return;
716 }
717 fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
718
719 for (int32_t n=0; n<fDStates->size(); n++) {
720 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
721 int32_t laSlotForState = 0;
722
723 // Establish the look-ahead slot for this state, if the state covers
724 // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
725
726 // If any of the look-ahead nodes already have a slot assigned, use it,
727 // otherwise assign a new one.
728
729 bool sawLookAheadNode = false;
730 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
731 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
732 if (node->fType != RBBINode::NodeType::lookAhead) {
733 continue;
734 }
735 sawLookAheadNode = true;
736 int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
737 U_ASSERT(ruleNum < fLookAheadRuleMap->size());
738 U_ASSERT(ruleNum > 0);
739 int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
740 if (laSlot != 0) {
741 if (laSlotForState == 0) {
742 laSlotForState = laSlot;
743 } else {
744 // TODO: figure out if this can fail, change to setting an error code if so.
745 U_ASSERT(laSlot == laSlotForState);
746 }
747 }
748 }
749 if (!sawLookAheadNode) {
750 continue;
751 }
752
753 if (laSlotForState == 0) {
754 laSlotForState = ++fLASlotsInUse;
755 }
756
757 // For each look ahead node covered by this state,
758 // set the mapping from the node's rule number to the look ahead slot.
759 // There can be multiple nodes/rule numbers going to the same la slot.
760
761 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
762 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
763 if (node->fType != RBBINode::NodeType::lookAhead) {
764 continue;
765 }
766 int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
767 int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
768 (void)existingVal;
769 U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
770 fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
771 }
772 }
773
774 }
775
776 //-----------------------------------------------------------------------------
777 //
778 // flagAcceptingStates Identify accepting states.
779 // First get a list of all of the end marker nodes.
780 // Then, for each state s,
781 // if s contains one of the end marker nodes in its list of tree positions then
782 // s is an accepting state.
783 //
784 //-----------------------------------------------------------------------------
flagAcceptingStates()785 void RBBITableBuilder::flagAcceptingStates() {
786 if (U_FAILURE(*fStatus)) {
787 return;
788 }
789 UVector endMarkerNodes(*fStatus);
790 RBBINode *endMarker;
791 int32_t i;
792 int32_t n;
793
794 if (U_FAILURE(*fStatus)) {
795 return;
796 }
797
798 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
799 if (U_FAILURE(*fStatus)) {
800 return;
801 }
802
803 for (i=0; i<endMarkerNodes.size(); i++) {
804 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
805 for (n=0; n<fDStates->size(); n++) {
806 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
807 if (sd->fPositions->indexOf(endMarker) >= 0) {
808 // Any non-zero value for fAccepting means this is an accepting node.
809 // The value is what will be returned to the user as the break status.
810 // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
811
812 if (sd->fAccepting==0) {
813 // State hasn't been marked as accepting yet. Do it now.
814 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
815 if (sd->fAccepting == 0) {
816 sd->fAccepting = ACCEPTING_UNCONDITIONAL;
817 }
818 }
819 if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) {
820 // Both lookahead and non-lookahead accepting for this state.
821 // Favor the look-ahead, because a look-ahead match needs to
822 // immediately stop the run-time engine. First match, not longest.
823 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
824 }
825 // implicit else:
826 // if sd->fAccepting already had a value other than 0 or 1, leave it be.
827 }
828 }
829 }
830 }
831
832
833 //-----------------------------------------------------------------------------
834 //
835 // flagLookAheadStates Very similar to flagAcceptingStates, above.
836 //
837 //-----------------------------------------------------------------------------
flagLookAheadStates()838 void RBBITableBuilder::flagLookAheadStates() {
839 if (U_FAILURE(*fStatus)) {
840 return;
841 }
842 UVector lookAheadNodes(*fStatus);
843 RBBINode *lookAheadNode;
844 int32_t i;
845 int32_t n;
846
847 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
848 if (U_FAILURE(*fStatus)) {
849 return;
850 }
851 for (i=0; i<lookAheadNodes.size(); i++) {
852 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
853 U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
854
855 for (n=0; n<fDStates->size(); n++) {
856 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
857 int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
858 if (positionsIdx >= 0) {
859 U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
860 uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
861 U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
862 // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
863 // printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n",
864 // __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
865 // }
866 sd->fLookAhead = lookaheadSlot;
867 }
868 }
869 }
870 }
871
872
873
874
875 //-----------------------------------------------------------------------------
876 //
877 // flagTaggedStates
878 //
879 //-----------------------------------------------------------------------------
flagTaggedStates()880 void RBBITableBuilder::flagTaggedStates() {
881 if (U_FAILURE(*fStatus)) {
882 return;
883 }
884 UVector tagNodes(*fStatus);
885 RBBINode *tagNode;
886 int32_t i;
887 int32_t n;
888
889 if (U_FAILURE(*fStatus)) {
890 return;
891 }
892 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
893 if (U_FAILURE(*fStatus)) {
894 return;
895 }
896 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
897 tagNode = (RBBINode *)tagNodes.elementAt(i);
898
899 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
900 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
901 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
902 sortedAdd(&sd->fTagVals, tagNode->fVal);
903 }
904 }
905 }
906 }
907
908
909
910
911 //-----------------------------------------------------------------------------
912 //
913 // mergeRuleStatusVals
914 //
915 // Update the global table of rule status {tag} values
916 // The rule builder has a global vector of status values that are common
917 // for all tables. Merge the ones from this table into the global set.
918 //
919 //-----------------------------------------------------------------------------
mergeRuleStatusVals()920 void RBBITableBuilder::mergeRuleStatusVals() {
921 //
922 // The basic outline of what happens here is this...
923 //
924 // for each state in this state table
925 // if the status tag list for this state is in the global statuses list
926 // record where and
927 // continue with the next state
928 // else
929 // add the tag list for this state to the global list.
930 //
931 int i;
932 int n;
933
934 // Pre-set a single tag of {0} into the table.
935 // We will need this as a default, for rule sets with no explicit tagging.
936 if (fRB->fRuleStatusVals->size() == 0) {
937 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
938 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
939 }
940
941 // For each state
942 for (n=0; n<fDStates->size(); n++) {
943 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
944 UVector *thisStatesTagValues = sd->fTagVals;
945 if (thisStatesTagValues == NULL) {
946 // No tag values are explicitly associated with this state.
947 // Set the default tag value.
948 sd->fTagsIdx = 0;
949 continue;
950 }
951
952 // There are tag(s) associated with this state.
953 // fTagsIdx will be the index into the global tag list for this state's tag values.
954 // Initial value of -1 flags that we haven't got it set yet.
955 sd->fTagsIdx = -1;
956 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
957 int32_t nextTagGroupStart = 0;
958
959 // Loop runs once per group of tags in the global list
960 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
961 thisTagGroupStart = nextTagGroupStart;
962 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
963 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
964 // The number of tags for this state is different from
965 // the number of tags in this group from the global list.
966 // Continue with the next group from the global list.
967 continue;
968 }
969 // The lengths match, go ahead and compare the actual tag values
970 // between this state and the group from the global list.
971 for (i=0; i<thisStatesTagValues->size(); i++) {
972 if (thisStatesTagValues->elementAti(i) !=
973 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
974 // Mismatch.
975 break;
976 }
977 }
978
979 if (i == thisStatesTagValues->size()) {
980 // We found a set of tag values in the global list that match
981 // those for this state. Use them.
982 sd->fTagsIdx = thisTagGroupStart;
983 break;
984 }
985 }
986
987 if (sd->fTagsIdx == -1) {
988 // No suitable entry in the global tag list already. Add one
989 sd->fTagsIdx = fRB->fRuleStatusVals->size();
990 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
991 for (i=0; i<thisStatesTagValues->size(); i++) {
992 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
993 }
994 }
995 }
996 }
997
998
999
1000
1001
1002
1003
1004 //-----------------------------------------------------------------------------
1005 //
1006 // sortedAdd Add a value to a vector of sorted values (ints).
1007 // Do not replicate entries; if the value is already there, do not
1008 // add a second one.
1009 // Lazily create the vector if it does not already exist.
1010 //
1011 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)1012 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
1013 int32_t i;
1014
1015 if (*vector == NULL) {
1016 *vector = new UVector(*fStatus);
1017 }
1018 if (*vector == NULL || U_FAILURE(*fStatus)) {
1019 return;
1020 }
1021 UVector *vec = *vector;
1022 int32_t vSize = vec->size();
1023 for (i=0; i<vSize; i++) {
1024 int32_t valAtI = vec->elementAti(i);
1025 if (valAtI == val) {
1026 // The value is already in the vector. Don't add it again.
1027 return;
1028 }
1029 if (valAtI > val) {
1030 break;
1031 }
1032 }
1033 vec->insertElementAt(val, i, *fStatus);
1034 }
1035
1036
1037
1038 //-----------------------------------------------------------------------------
1039 //
1040 // setAdd Set operation on UVector
1041 // dest = dest union source
1042 // Elements may only appear once and must be sorted.
1043 //
1044 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)1045 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
1046 U_ASSERT(!dest->hasDeleter());
1047 U_ASSERT(!source->hasDeleter());
1048 int32_t destOriginalSize = dest->size();
1049 int32_t sourceSize = source->size();
1050 int32_t di = 0;
1051 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
1052 void **destPtr, **sourcePtr;
1053 void **destLim, **sourceLim;
1054
1055 if (destOriginalSize > destArray.getCapacity()) {
1056 if (destArray.resize(destOriginalSize) == NULL) {
1057 return;
1058 }
1059 }
1060 destPtr = destArray.getAlias();
1061 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
1062
1063 if (sourceSize > sourceArray.getCapacity()) {
1064 if (sourceArray.resize(sourceSize) == NULL) {
1065 return;
1066 }
1067 }
1068 sourcePtr = sourceArray.getAlias();
1069 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
1070
1071 // Avoid multiple "get element" calls by getting the contents into arrays
1072 (void) dest->toArray(destPtr);
1073 (void) source->toArray(sourcePtr);
1074
1075 dest->setSize(sourceSize+destOriginalSize, *fStatus);
1076 if (U_FAILURE(*fStatus)) {
1077 return;
1078 }
1079
1080 while (sourcePtr < sourceLim && destPtr < destLim) {
1081 if (*destPtr == *sourcePtr) {
1082 dest->setElementAt(*sourcePtr++, di++);
1083 destPtr++;
1084 }
1085 // This check is required for machines with segmented memory, like i5/OS.
1086 // Direct pointer comparison is not recommended.
1087 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1088 dest->setElementAt(*destPtr++, di++);
1089 }
1090 else { /* *sourcePtr < *destPtr */
1091 dest->setElementAt(*sourcePtr++, di++);
1092 }
1093 }
1094
1095 // At most one of these two cleanup loops will execute
1096 while (destPtr < destLim) {
1097 dest->setElementAt(*destPtr++, di++);
1098 }
1099 while (sourcePtr < sourceLim) {
1100 dest->setElementAt(*sourcePtr++, di++);
1101 }
1102
1103 dest->setSize(di, *fStatus);
1104 }
1105
1106
1107
1108 //-----------------------------------------------------------------------------
1109 //
1110 // setEqual Set operation on UVector.
1111 // Compare for equality.
1112 // Elements must be sorted.
1113 //
1114 //-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1115 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1116 return a->equals(*b);
1117 }
1118
1119
//-----------------------------------------------------------------------------
//
//   printPosSets   Debug function.  Recursively dump the Nullable flag and
//                  the firstpos, lastpos and followpos sets of node n,
//                  then of its left and right subtrees.
//                  A null node prints nothing.
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBITableBuilder::printPosSets(RBBINode *n) {
    if (n != nullptr) {
        printf("\n");
        RBBINode::printNodeHeader();
        RBBINode::printNode(n);

        const char *nullableText = n->fNullable ? "TRUE" : "FALSE";
        RBBIDebugPrintf(" Nullable: %s\n", nullableText);

        RBBIDebugPrintf(" firstpos: ");
        printSet(n->fFirstPosSet);

        RBBIDebugPrintf(" lastpos: ");
        printSet(n->fLastPosSet);

        RBBIDebugPrintf(" followpos: ");
        printSet(n->fFollowPos);

        // Pre-order traversal: this node first, then both children.
        printPosSets(n->fLeftChild);
        printPosSets(n->fRightChild);
    }
}
#endif
1149
1150 //
1151 // findDuplCharClassFrom()
1152 //
findDuplCharClassFrom(IntPair * categories)1153 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1154 int32_t numStates = fDStates->size();
1155 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1156
1157 for (; categories->first < numCols-1; categories->first++) {
1158 // Note: dictionary & non-dictionary columns cannot be merged.
1159 // The limitSecond value prevents considering mixed pairs.
1160 // Dictionary categories are >= DictCategoriesStart.
1161 // Non dict categories are < DictCategoriesStart.
1162 int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
1163 fRB->fSetBuilder->getDictCategoriesStart() : numCols;
1164 for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
1165 // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1166 uint16_t table_base = 0;
1167 uint16_t table_dupl = 1;
1168 for (int32_t state=0; state<numStates; state++) {
1169 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1170 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1171 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1172 if (table_base != table_dupl) {
1173 break;
1174 }
1175 }
1176 if (table_base == table_dupl) {
1177 return true;
1178 }
1179 }
1180 }
1181 return false;
1182 }
1183
1184
1185 //
1186 // removeColumn()
1187 //
removeColumn(int32_t column)1188 void RBBITableBuilder::removeColumn(int32_t column) {
1189 int32_t numStates = fDStates->size();
1190 for (int32_t state=0; state<numStates; state++) {
1191 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1192 U_ASSERT(column < sd->fDtran->size());
1193 sd->fDtran->removeElementAt(column);
1194 }
1195 }
1196
1197 /*
1198 * findDuplicateState
1199 */
findDuplicateState(IntPair * states)1200 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1201 int32_t numStates = fDStates->size();
1202 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1203
1204 for (; states->first<numStates-1; states->first++) {
1205 RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1206 for (states->second=states->first+1; states->second<numStates; states->second++) {
1207 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1208 if (firstSD->fAccepting != duplSD->fAccepting ||
1209 firstSD->fLookAhead != duplSD->fLookAhead ||
1210 firstSD->fTagsIdx != duplSD->fTagsIdx) {
1211 continue;
1212 }
1213 bool rowsMatch = true;
1214 for (int32_t col=0; col < numCols; ++col) {
1215 int32_t firstVal = firstSD->fDtran->elementAti(col);
1216 int32_t duplVal = duplSD->fDtran->elementAti(col);
1217 if (!((firstVal == duplVal) ||
1218 ((firstVal == states->first || firstVal == states->second) &&
1219 (duplVal == states->first || duplVal == states->second)))) {
1220 rowsMatch = false;
1221 break;
1222 }
1223 }
1224 if (rowsMatch) {
1225 return true;
1226 }
1227 }
1228 }
1229 return false;
1230 }
1231
1232
findDuplicateSafeState(IntPair * states)1233 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1234 int32_t numStates = fSafeTable->size();
1235
1236 for (; states->first<numStates-1; states->first++) {
1237 UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1238 for (states->second=states->first+1; states->second<numStates; states->second++) {
1239 UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1240 bool rowsMatch = true;
1241 int32_t numCols = firstRow->length();
1242 for (int32_t col=0; col < numCols; ++col) {
1243 int32_t firstVal = firstRow->charAt(col);
1244 int32_t duplVal = duplRow->charAt(col);
1245 if (!((firstVal == duplVal) ||
1246 ((firstVal == states->first || firstVal == states->second) &&
1247 (duplVal == states->first || duplVal == states->second)))) {
1248 rowsMatch = false;
1249 break;
1250 }
1251 }
1252 if (rowsMatch) {
1253 return true;
1254 }
1255 }
1256 }
1257 return false;
1258 }
1259
1260
removeState(IntPair duplStates)1261 void RBBITableBuilder::removeState(IntPair duplStates) {
1262 const int32_t keepState = duplStates.first;
1263 const int32_t duplState = duplStates.second;
1264 U_ASSERT(keepState < duplState);
1265 U_ASSERT(duplState < fDStates->size());
1266
1267 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1268 fDStates->removeElementAt(duplState);
1269 delete duplSD;
1270
1271 int32_t numStates = fDStates->size();
1272 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1273 for (int32_t state=0; state<numStates; ++state) {
1274 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1275 for (int32_t col=0; col<numCols; col++) {
1276 int32_t existingVal = sd->fDtran->elementAti(col);
1277 int32_t newVal = existingVal;
1278 if (existingVal == duplState) {
1279 newVal = keepState;
1280 } else if (existingVal > duplState) {
1281 newVal = existingVal - 1;
1282 }
1283 sd->fDtran->setElementAt(newVal, col);
1284 }
1285 }
1286 }
1287
removeSafeState(IntPair duplStates)1288 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1289 const int32_t keepState = duplStates.first;
1290 const int32_t duplState = duplStates.second;
1291 U_ASSERT(keepState < duplState);
1292 U_ASSERT(duplState < fSafeTable->size());
1293
1294 fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function
1295 // and will auto-delete the removed element.
1296 int32_t numStates = fSafeTable->size();
1297 for (int32_t state=0; state<numStates; ++state) {
1298 UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1299 int32_t numCols = sd->length();
1300 for (int32_t col=0; col<numCols; col++) {
1301 int32_t existingVal = sd->charAt(col);
1302 int32_t newVal = existingVal;
1303 if (existingVal == duplState) {
1304 newVal = keepState;
1305 } else if (existingVal > duplState) {
1306 newVal = existingVal - 1;
1307 }
1308 sd->setCharAt(col, static_cast<char16_t>(newVal));
1309 }
1310 }
1311 }
1312
1313
1314 /*
1315 * RemoveDuplicateStates
1316 */
removeDuplicateStates()1317 int32_t RBBITableBuilder::removeDuplicateStates() {
1318 IntPair dupls = {3, 0};
1319 int32_t numStatesRemoved = 0;
1320
1321 while (findDuplicateState(&dupls)) {
1322 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1323 removeState(dupls);
1324 ++numStatesRemoved;
1325 }
1326 return numStatesRemoved;
1327 }
1328
1329
1330 //-----------------------------------------------------------------------------
1331 //
1332 // getTableSize() Calculate the size of the runtime form of this
1333 // state transition table.
1334 //
1335 //-----------------------------------------------------------------------------
getTableSize() const1336 int32_t RBBITableBuilder::getTableSize() const {
1337 int32_t size = 0;
1338 int32_t numRows;
1339 int32_t numCols;
1340 int32_t rowSize;
1341
1342 if (fTree == NULL) {
1343 return 0;
1344 }
1345
1346 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1347
1348 numRows = fDStates->size();
1349 numCols = fRB->fSetBuilder->getNumCharCategories();
1350
1351 if (use8BitsForTable()) {
1352 rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1353 } else {
1354 rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1355 }
1356 size += numRows * rowSize;
1357 return size;
1358 }
1359
use8BitsForTable() const1360 bool RBBITableBuilder::use8BitsForTable() const {
1361 return fDStates->size() <= kMaxStateFor8BitsTable;
1362 }
1363
1364 //-----------------------------------------------------------------------------
1365 //
1366 // exportTable() export the state transition table in the format required
1367 // by the runtime engine. getTableSize() bytes of memory
1368 // must be available at the output address "where".
1369 //
1370 //-----------------------------------------------------------------------------
exportTable(void * where)1371 void RBBITableBuilder::exportTable(void *where) {
1372 RBBIStateTable *table = (RBBIStateTable *)where;
1373 uint32_t state;
1374 int col;
1375
1376 if (U_FAILURE(*fStatus) || fTree == NULL) {
1377 return;
1378 }
1379
1380 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1381 if (catCount > 0x7fff ||
1382 fDStates->size() > 0x7fff) {
1383 *fStatus = U_BRK_INTERNAL_ERROR;
1384 return;
1385 }
1386
1387 table->fNumStates = fDStates->size();
1388 table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
1389 table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
1390 table->fFlags = 0;
1391 if (use8BitsForTable()) {
1392 table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1393 table->fFlags |= RBBI_8BITS_ROWS;
1394 } else {
1395 table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1396 }
1397 if (fRB->fLookAheadHardBreak) {
1398 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1399 }
1400 if (fRB->fSetBuilder->sawBOF()) {
1401 table->fFlags |= RBBI_BOF_REQUIRED;
1402 }
1403
1404 for (state=0; state<table->fNumStates; state++) {
1405 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1406 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1407 if (use8BitsForTable()) {
1408 U_ASSERT (sd->fAccepting <= 255);
1409 U_ASSERT (sd->fLookAhead <= 255);
1410 U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255);
1411 RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1412 r8->fAccepting = sd->fAccepting;
1413 r8->fLookAhead = sd->fLookAhead;
1414 r8->fTagsIdx = sd->fTagsIdx;
1415 for (col=0; col<catCount; col++) {
1416 U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
1417 r8->fNextState[col] = sd->fDtran->elementAti(col);
1418 }
1419 } else {
1420 U_ASSERT (sd->fAccepting <= 0xffff);
1421 U_ASSERT (sd->fLookAhead <= 0xffff);
1422 U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff);
1423 row->r16.fAccepting = sd->fAccepting;
1424 row->r16.fLookAhead = sd->fLookAhead;
1425 row->r16.fTagsIdx = sd->fTagsIdx;
1426 for (col=0; col<catCount; col++) {
1427 row->r16.fNextState[col] = sd->fDtran->elementAti(col);
1428 }
1429 }
1430 }
1431 }
1432
1433
1434 /**
1435 * Synthesize a safe state table from the main state table.
1436 */
buildSafeReverseTable(UErrorCode & status)1437 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1438 // The safe table creation has three steps:
1439
1440 // 1. Identify pairs of character classes that are "safe." Safe means that boundaries
1441 // following the pair do not depend on context or state before the pair. To test
1442 // whether a pair is safe, run it through the main forward state table, starting
1443 // from each state. If the the final state is the same, no matter what the starting state,
1444 // the pair is safe.
1445 //
1446 // 2. Build a state table that recognizes the safe pairs. It's similar to their
1447 // forward table, with a column for each input character [class], and a row for
1448 // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1449 // create an additional state for each input character category; being in
1450 // one of these states means that the character has been seen, and is potentially
1451 // the first of a pair. In each of these rows, the entry for the second character
1452 // of a safe pair is set to the stop state (0), indicating that a match was found.
1453 // All other table entries are set to the state corresponding the current input
1454 // character, allowing that character to be the of a start following pair.
1455 //
1456 // Because the safe rules are to be run in reverse, moving backwards in the text,
1457 // the first and second pair categories are swapped when building the table.
1458 //
1459 // 3. Compress the table. There are typically many rows (states) that are
1460 // equivalent - that have zeroes (match completed) in the same columns -
1461 // and can be folded together.
1462
1463 // Each safe pair is stored as two UChars in the safePair string.
1464 UnicodeString safePairs;
1465
1466 int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1467 int32_t numStates = fDStates->size();
1468
1469 for (int32_t c1=0; c1<numCharClasses; ++c1) {
1470 for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1471 int32_t wantedEndState = -1;
1472 int32_t endState = 0;
1473 for (int32_t startState = 1; startState < numStates; ++startState) {
1474 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1475 int32_t s2 = startStateD->fDtran->elementAti(c1);
1476 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1477 endState = s2StateD->fDtran->elementAti(c2);
1478 if (wantedEndState < 0) {
1479 wantedEndState = endState;
1480 } else {
1481 if (wantedEndState != endState) {
1482 break;
1483 }
1484 }
1485 }
1486 if (wantedEndState == endState) {
1487 safePairs.append((char16_t)c1);
1488 safePairs.append((char16_t)c2);
1489 // printf("(%d, %d) ", c1, c2);
1490 }
1491 }
1492 // printf("\n");
1493 }
1494
1495 // Populate the initial safe table.
1496 // The table as a whole is UVector<UnicodeString>
1497 // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1498 // Row 0 is the stop state.
1499 // Row 1 is the start state.
1500 // Row 2 and beyond are other states, initially one per char class, but
1501 // after initial construction, many of the states will be combined, compacting the table.
1502 // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1503 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1504
1505 U_ASSERT(fSafeTable == nullptr);
1506 LocalPointer<UVector> lpSafeTable(
1507 new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status), status);
1508 if (U_FAILURE(status)) {
1509 return;
1510 }
1511 fSafeTable = lpSafeTable.orphan();
1512 for (int32_t row=0; row<numCharClasses + 2; ++row) {
1513 LocalPointer<UnicodeString> lpString(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1514 fSafeTable->adoptElement(lpString.orphan(), status);
1515 }
1516 if (U_FAILURE(status)) {
1517 return;
1518 }
1519
1520 // From the start state, each input char class transitions to the state for that input.
1521 UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1522 for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1523 // Note: +2 for the start & stop state.
1524 startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1525 }
1526
1527 // Initially make every other state table row look like the start state row,
1528 for (int32_t row=2; row<numCharClasses+2; ++row) {
1529 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1530 rowState = startState; // UnicodeString assignment, copies contents.
1531 }
1532
1533 // Run through the safe pairs, set the next state to zero when pair has been seen.
1534 // Zero being the stop state, meaning we found a safe point.
1535 for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1536 int32_t c1 = safePairs.charAt(pairIdx);
1537 int32_t c2 = safePairs.charAt(pairIdx + 1);
1538
1539 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1540 rowState.setCharAt(c1, 0);
1541 }
1542
1543 // Remove duplicate or redundant rows from the table.
1544 IntPair states = {1, 0};
1545 while (findDuplicateSafeState(&states)) {
1546 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1547 removeSafeState(states);
1548 }
1549 }
1550
1551
1552 //-----------------------------------------------------------------------------
1553 //
1554 // getSafeTableSize() Calculate the size of the runtime form of this
1555 // safe state table.
1556 //
1557 //-----------------------------------------------------------------------------
getSafeTableSize() const1558 int32_t RBBITableBuilder::getSafeTableSize() const {
1559 int32_t size = 0;
1560 int32_t numRows;
1561 int32_t numCols;
1562 int32_t rowSize;
1563
1564 if (fSafeTable == nullptr) {
1565 return 0;
1566 }
1567
1568 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1569
1570 numRows = fSafeTable->size();
1571 numCols = fRB->fSetBuilder->getNumCharCategories();
1572
1573 if (use8BitsForSafeTable()) {
1574 rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1575 } else {
1576 rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1577 }
1578 size += numRows * rowSize;
1579 return size;
1580 }
1581
use8BitsForSafeTable() const1582 bool RBBITableBuilder::use8BitsForSafeTable() const {
1583 return fSafeTable->size() <= kMaxStateFor8BitsTable;
1584 }
1585
1586 //-----------------------------------------------------------------------------
1587 //
1588 // exportSafeTable() export the state transition table in the format required
1589 // by the runtime engine. getTableSize() bytes of memory
1590 // must be available at the output address "where".
1591 //
1592 //-----------------------------------------------------------------------------
exportSafeTable(void * where)1593 void RBBITableBuilder::exportSafeTable(void *where) {
1594 RBBIStateTable *table = (RBBIStateTable *)where;
1595 uint32_t state;
1596 int col;
1597
1598 if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1599 return;
1600 }
1601
1602 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1603 if (catCount > 0x7fff ||
1604 fSafeTable->size() > 0x7fff) {
1605 *fStatus = U_BRK_INTERNAL_ERROR;
1606 return;
1607 }
1608
1609 table->fNumStates = fSafeTable->size();
1610 table->fFlags = 0;
1611 if (use8BitsForSafeTable()) {
1612 table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1613 table->fFlags |= RBBI_8BITS_ROWS;
1614 } else {
1615 table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1616 }
1617
1618 for (state=0; state<table->fNumStates; state++) {
1619 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1620 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1621 if (use8BitsForSafeTable()) {
1622 RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1623 r8->fAccepting = 0;
1624 r8->fLookAhead = 0;
1625 r8->fTagsIdx = 0;
1626 for (col=0; col<catCount; col++) {
1627 U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
1628 r8->fNextState[col] = static_cast<uint8_t>(rowString->charAt(col));
1629 }
1630 } else {
1631 row->r16.fAccepting = 0;
1632 row->r16.fLookAhead = 0;
1633 row->r16.fTagsIdx = 0;
1634 for (col=0; col<catCount; col++) {
1635 row->r16.fNextState[col] = rowString->charAt(col);
1636 }
1637 }
1638 }
1639 }
1640
1641
1642
1643
//-----------------------------------------------------------------------------
//
//   printSet    Debug function.   Print the contents of a UVector whose
//               elements are RBBINode pointers: the serial number of each
//               node (-1 for a null element), followed by a newline.
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBITableBuilder::printSet(UVector *s) {
    int32_t idx = 0;
    while (idx < s->size()) {
        const RBBINode *node = static_cast<const RBBINode *>(s->elementAt(idx));
        RBBIDebugPrintf("%5d", node==nullptr? -1 : node->fSerialNum);
        ++idx;
    }
    RBBIDebugPrintf("\n");
}
#endif
1659
1660
//-----------------------------------------------------------------------------
//
//   printStates    Debug Function.  Dump the fully constructed state
//                  transition table: a heading with one column per character
//                  category, then one line per state showing fAccepting,
//                  fLookAhead, fTagsIdx and the next state for each category.
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBITableBuilder::printStates() {
    const int numCategories = fRB->fSetBuilder->getNumCharCategories();
    int c;    // input "character"
    int n;    // state number

    // Heading: column numbers.
    RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
    RBBIDebugPrintf(" | Acc LA Tag");
    c = 0;
    while (c < numCategories) {
        RBBIDebugPrintf(" %3d", c);
        ++c;
    }
    RBBIDebugPrintf("\n");

    // Heading: separator line.
    RBBIDebugPrintf(" |---------------");
    c = 0;
    while (c < numCategories) {
        RBBIDebugPrintf("----");
        ++c;
    }
    RBBIDebugPrintf("\n");

    // Body: one line per state.
    n = 0;
    while (n < fDStates->size()) {
        RBBIStateDescriptor *sd = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(n));
        RBBIDebugPrintf(" %3d | " , n);
        RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
        for (c = 0; c < numCategories; ++c) {
            RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
        }
        RBBIDebugPrintf("\n");
        ++n;
    }
    RBBIDebugPrintf("\n\n");
}
#endif
1695
1696
//-----------------------------------------------------------------------------
//
//   printReverseTable    Debug Function.  Dump the fully constructed safe
//                        reverse table: a heading with one column per
//                        character category, then one line per row of
//                        fSafeTable. Prints a placeholder when no safe table
//                        has been built.
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBITableBuilder::printReverseTable() {
    RBBIDebugPrintf(" Safe Reverse Table \n");
    if (fSafeTable == nullptr) {
        RBBIDebugPrintf(" --- nullptr ---\n");
        return;
    }

    const int numCategories = fRB->fSetBuilder->getNumCharCategories();
    int c;    // input "character"
    int n;    // state number

    // Heading: column numbers.
    RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
    RBBIDebugPrintf(" | Acc LA Tag");
    c = 0;
    while (c < numCategories) {
        RBBIDebugPrintf(" %2d", c);
        ++c;
    }
    RBBIDebugPrintf("\n");

    // Heading: separator line.
    RBBIDebugPrintf(" |---------------");
    c = 0;
    while (c < numCategories) {
        RBBIDebugPrintf("---");
        ++c;
    }
    RBBIDebugPrintf("\n");

    // Body: one line per safe table row.
    n = 0;
    while (n < fSafeTable->size()) {
        UnicodeString *rowString = static_cast<UnicodeString *>(fSafeTable->elementAt(n));
        RBBIDebugPrintf(" %3d | " , n);
        RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
        for (c = 0; c < numCategories; ++c) {
            RBBIDebugPrintf(" %2d", rowString->charAt(c));
        }
        RBBIDebugPrintf("\n");
        ++n;
    }
    RBBIDebugPrintf("\n\n");
}
#endif
1736
1737
1738
//-----------------------------------------------------------------------------
//
//   printRuleStatusTable    Debug Function.  Dump the common rule status
//                           table. The table is a sequence of records, each
//                           a count followed by that many values; one line
//                           is printed per record, giving its index and its
//                           values.
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBITableBuilder::printRuleStatusTable() {
    UVector *tbl = fRB->fRuleStatusVals;

    RBBIDebugPrintf("index | tags \n");
    RBBIDebugPrintf("-------------------\n");

    int32_t recordStart = 0;
    while (recordStart < tbl->size()) {
        // The first element of a record is the number of values that follow.
        const int32_t recordLimit = recordStart + tbl->elementAti(recordStart) + 1;
        RBBIDebugPrintf("%4d ", recordStart);
        for (int i = recordStart+1; i < recordLimit; i++) {
            RBBIDebugPrintf(" %5d", tbl->elementAti(i));
        }
        RBBIDebugPrintf("\n");
        recordStart = recordLimit;
    }
    RBBIDebugPrintf("\n\n");
}
#endif
1766
1767
1768 //-----------------------------------------------------------------------------
1769 //
//  RBBIStateDescriptor     Methods.  This is a very struct-like class.
//                          Most access is directly to the fields.
1772 //
1773 //-----------------------------------------------------------------------------
1774
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1775 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1776 fMarked = FALSE;
1777 fAccepting = 0;
1778 fLookAhead = 0;
1779 fTagsIdx = 0;
1780 fTagVals = NULL;
1781 fPositions = NULL;
1782 fDtran = NULL;
1783
1784 fDtran = new UVector32(lastInputSymbol+1, *fStatus);
1785 if (U_FAILURE(*fStatus)) {
1786 return;
1787 }
1788 if (fDtran == NULL) {
1789 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1790 return;
1791 }
1792 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
1793 // It is indexed by input symbols, and will
1794 // hold the next state number for each
1795 // symbol.
1796 }
1797
1798
~RBBIStateDescriptor()1799 RBBIStateDescriptor::~RBBIStateDescriptor() {
1800 delete fPositions;
1801 delete fDtran;
1802 delete fTagVals;
1803 fPositions = NULL;
1804 fDtran = NULL;
1805 fTagVals = NULL;
1806 }
1807
1808 U_NAMESPACE_END
1809
1810 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1811