1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 // rbbitblb.cpp
11 //
12
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbiscan.h"
22 #include "rbbisetb.h"
23 #include "rbbidata.h"
24 #include "cstring.h"
25 #include "uassert.h"
26 #include "uvectr32.h"
27 #include "cmemory.h"
28
29 U_NAMESPACE_BEGIN
30
31 const int32_t kMaxStateFor8BitsTable = 255;
32
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode,UErrorCode & status)33 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
34 fRB(rb),
35 fTree(*rootNode),
36 fStatus(&status),
37 fDStates(nullptr),
38 fSafeTable(nullptr) {
39 if (U_FAILURE(status)) {
40 return;
41 }
42 // fDStates is UVector<RBBIStateDescriptor *>
43 fDStates = new UVector(status);
44 if (U_SUCCESS(status) && fDStates == nullptr ) {
45 status = U_MEMORY_ALLOCATION_ERROR;
46 }
47 }
48
49
50
~RBBITableBuilder()51 RBBITableBuilder::~RBBITableBuilder() {
52 int i;
53 for (i=0; i<fDStates->size(); i++) {
54 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
55 }
56 delete fDStates;
57 delete fSafeTable;
58 delete fLookAheadRuleMap;
59 }
60
61
62 //-----------------------------------------------------------------------------
63 //
64 // RBBITableBuilder::buildForwardTable - This is the main function for building
65 // the DFA state transition table from the RBBI rules parse tree.
66 //
67 //-----------------------------------------------------------------------------
buildForwardTable()68 void RBBITableBuilder::buildForwardTable() {
69
70 if (U_FAILURE(*fStatus)) {
71 return;
72 }
73
74 // If there were no rules, just return. This situation can easily arise
75 // for the reverse rules.
76 if (fTree==nullptr) {
77 return;
78 }
79
80 //
81 // Walk through the tree, replacing any references to $variables with a copy of the
82 // parse tree for the substitution expression.
83 //
84 fTree = fTree->flattenVariables(*fStatus, 0);
85 if (U_FAILURE(*fStatus)) {
86 return;
87 }
88 #ifdef RBBI_DEBUG
89 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
90 RBBIDebugPuts("\nParse tree after flattening variable references.");
91 RBBINode::printTree(fTree, true);
92 }
93 #endif
94
95 //
96 // If the rules contained any references to {bof}
97 // add a {bof} <cat> <former root of tree> to the
98 // tree. Means that all matches must start out with the
99 // {bof} fake character.
100 //
101 if (fRB->fSetBuilder->sawBOF()) {
102 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
103 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
104 // Delete and exit if memory allocation failed.
105 if (bofTop == nullptr || bofLeaf == nullptr) {
106 *fStatus = U_MEMORY_ALLOCATION_ERROR;
107 delete bofTop;
108 delete bofLeaf;
109 return;
110 }
111 bofTop->fLeftChild = bofLeaf;
112 bofTop->fRightChild = fTree;
113 bofLeaf->fParent = bofTop;
114 bofLeaf->fVal = 2; // Reserved value for {bof}.
115 fTree = bofTop;
116 }
117
118 //
119 // Add a unique right-end marker to the expression.
120 // Appears as a cat-node, left child being the original tree,
121 // right child being the end marker.
122 //
123 RBBINode *cn = new RBBINode(RBBINode::opCat);
124 // Exit if memory allocation failed.
125 if (cn == nullptr) {
126 *fStatus = U_MEMORY_ALLOCATION_ERROR;
127 return;
128 }
129 cn->fLeftChild = fTree;
130 fTree->fParent = cn;
131 RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
132 // Delete and exit if memory allocation failed.
133 if (cn->fRightChild == nullptr) {
134 *fStatus = U_MEMORY_ALLOCATION_ERROR;
135 delete cn;
136 return;
137 }
138 cn->fRightChild->fParent = cn;
139 fTree = cn;
140
141 //
142 // Replace all references to UnicodeSets with the tree for the equivalent
143 // expression.
144 //
145 fTree->flattenSets();
146 #ifdef RBBI_DEBUG
147 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
148 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
149 RBBINode::printTree(fTree, true);
150 }
151 #endif
152
153
154 //
155 // calculate the functions nullable, firstpos, lastpos and followpos on
156 // nodes in the parse tree.
157 // See the algorithm description in Aho.
158 // Understanding how this works by looking at the code alone will be
159 // nearly impossible.
160 //
161 calcNullable(fTree);
162 calcFirstPos(fTree);
163 calcLastPos(fTree);
164 calcFollowPos(fTree);
165 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
166 RBBIDebugPuts("\n");
167 printPosSets(fTree);
168 }
169
170 //
171 // For "chained" rules, modify the followPos sets
172 //
173 if (fRB->fChainRules) {
174 calcChainedFollowPos(fTree, endMarkerNode);
175 }
176
177 //
178 // BOF (start of input) test fixup.
179 //
180 if (fRB->fSetBuilder->sawBOF()) {
181 bofFixup();
182 }
183
184 //
185 // Build the DFA state transition tables.
186 //
187 buildStateTable();
188 mapLookAheadRules();
189 flagAcceptingStates();
190 flagLookAheadStates();
191 flagTaggedStates();
192
193 //
194 // Update the global table of rule status {tag} values
195 // The rule builder has a global vector of status values that are common
196 // for all tables. Merge the ones from this table into the global set.
197 //
198 mergeRuleStatusVals();
199 }
200
201
202
203 //-----------------------------------------------------------------------------
204 //
205 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
206 //
207 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)208 void RBBITableBuilder::calcNullable(RBBINode *n) {
209 if (n == nullptr) {
210 return;
211 }
212 if (n->fType == RBBINode::setRef ||
213 n->fType == RBBINode::endMark ) {
214 // These are non-empty leaf node types.
215 n->fNullable = false;
216 return;
217 }
218
219 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
220 // Lookahead marker node. It's a leaf, so no recursion on children.
221 // It's nullable because it does not match any literal text from the input stream.
222 n->fNullable = true;
223 return;
224 }
225
226
227 // The node is not a leaf.
228 // Calculate nullable on its children.
229 calcNullable(n->fLeftChild);
230 calcNullable(n->fRightChild);
231
232 // Apply functions from table 3.40 in Aho
233 if (n->fType == RBBINode::opOr) {
234 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
235 }
236 else if (n->fType == RBBINode::opCat) {
237 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
238 }
239 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
240 n->fNullable = true;
241 }
242 else {
243 n->fNullable = false;
244 }
245 }
246
247
248
249
250 //-----------------------------------------------------------------------------
251 //
252 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
253 //
254 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)255 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
256 if (n == nullptr) {
257 return;
258 }
259 if (n->fType == RBBINode::leafChar ||
260 n->fType == RBBINode::endMark ||
261 n->fType == RBBINode::lookAhead ||
262 n->fType == RBBINode::tag) {
263 // These are non-empty leaf node types.
264 // Note: In order to maintain the sort invariant on the set,
265 // this function should only be called on a node whose set is
266 // empty to start with.
267 n->fFirstPosSet->addElement(n, *fStatus);
268 return;
269 }
270
271 // The node is not a leaf.
272 // Calculate firstPos on its children.
273 calcFirstPos(n->fLeftChild);
274 calcFirstPos(n->fRightChild);
275
276 // Apply functions from table 3.40 in Aho
277 if (n->fType == RBBINode::opOr) {
278 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
279 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
280 }
281 else if (n->fType == RBBINode::opCat) {
282 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
283 if (n->fLeftChild->fNullable) {
284 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
285 }
286 }
287 else if (n->fType == RBBINode::opStar ||
288 n->fType == RBBINode::opQuestion ||
289 n->fType == RBBINode::opPlus) {
290 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
291 }
292 }
293
294
295
296 //-----------------------------------------------------------------------------
297 //
298 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
299 //
300 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)301 void RBBITableBuilder::calcLastPos(RBBINode *n) {
302 if (n == nullptr) {
303 return;
304 }
305 if (n->fType == RBBINode::leafChar ||
306 n->fType == RBBINode::endMark ||
307 n->fType == RBBINode::lookAhead ||
308 n->fType == RBBINode::tag) {
309 // These are non-empty leaf node types.
310 // Note: In order to maintain the sort invariant on the set,
311 // this function should only be called on a node whose set is
312 // empty to start with.
313 n->fLastPosSet->addElement(n, *fStatus);
314 return;
315 }
316
317 // The node is not a leaf.
318 // Calculate lastPos on its children.
319 calcLastPos(n->fLeftChild);
320 calcLastPos(n->fRightChild);
321
322 // Apply functions from table 3.40 in Aho
323 if (n->fType == RBBINode::opOr) {
324 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
325 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
326 }
327 else if (n->fType == RBBINode::opCat) {
328 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
329 if (n->fRightChild->fNullable) {
330 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
331 }
332 }
333 else if (n->fType == RBBINode::opStar ||
334 n->fType == RBBINode::opQuestion ||
335 n->fType == RBBINode::opPlus) {
336 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
337 }
338 }
339
340
341
342 //-----------------------------------------------------------------------------
343 //
344 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
345 //
346 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)347 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
348 if (n == nullptr ||
349 n->fType == RBBINode::leafChar ||
350 n->fType == RBBINode::endMark) {
351 return;
352 }
353
354 calcFollowPos(n->fLeftChild);
355 calcFollowPos(n->fRightChild);
356
357 // Aho rule #1
358 if (n->fType == RBBINode::opCat) {
359 RBBINode *i; // is 'i' in Aho's description
360 uint32_t ix;
361
362 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
363
364 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
365 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
366 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
367 }
368 }
369
370 // Aho rule #2
371 if (n->fType == RBBINode::opStar ||
372 n->fType == RBBINode::opPlus) {
373 RBBINode *i; // again, n and i are the names from Aho's description.
374 uint32_t ix;
375
376 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
377 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
378 setAdd(i->fFollowPos, n->fFirstPosSet);
379 }
380 }
381
382
383
384 }
385
386 //-----------------------------------------------------------------------------
387 //
388 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
389 // as roots of a rule to a destination vector.
390 //
391 //-----------------------------------------------------------------------------
addRuleRootNodes(UVector * dest,RBBINode * node)392 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
393 if (node == nullptr || U_FAILURE(*fStatus)) {
394 return;
395 }
396 U_ASSERT(!dest->hasDeleter());
397 if (node->fRuleRoot) {
398 dest->addElement(node, *fStatus);
399 // Note: rules cannot nest. If we found a rule start node,
400 // no child node can also be a start node.
401 return;
402 }
403 addRuleRootNodes(dest, node->fLeftChild);
404 addRuleRootNodes(dest, node->fRightChild);
405 }
406
407 //-----------------------------------------------------------------------------
408 //
409 // calcChainedFollowPos. Modify the previously calculated followPos sets
410 // to implement rule chaining. NOT described by Aho
411 //
412 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree,RBBINode * endMarkNode)413 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
414
415 UVector leafNodes(*fStatus);
416 if (U_FAILURE(*fStatus)) {
417 return;
418 }
419
420 // get a list all leaf nodes
421 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
422 if (U_FAILURE(*fStatus)) {
423 return;
424 }
425
426 // Collect all leaf nodes that can start matches for rules
427 // with inbound chaining enabled, which is the union of the
428 // firstPosition sets from each of the rule root nodes.
429
430 UVector ruleRootNodes(*fStatus);
431 addRuleRootNodes(&ruleRootNodes, tree);
432
433 UVector matchStartNodes(*fStatus);
434 for (int j=0; j<ruleRootNodes.size(); ++j) {
435 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
436 if (node->fChainIn) {
437 setAdd(&matchStartNodes, node->fFirstPosSet);
438 }
439 }
440 if (U_FAILURE(*fStatus)) {
441 return;
442 }
443
444 int32_t endNodeIx;
445 int32_t startNodeIx;
446
447 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
448 RBBINode *endNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
449
450 // Identify leaf nodes that correspond to overall rule match positions.
451 // These include the endMarkNode in their followPos sets.
452 //
453 // Note: do not consider other end marker nodes, those that are added to
454 // look-ahead rules. These can't chain; a match immediately stops
455 // further matching. This leaves exactly one end marker node, the one
456 // at the end of the complete tree.
457
458 if (!endNode->fFollowPos->contains(endMarkNode)) {
459 continue;
460 }
461
462 // We've got a node that can end a match.
463
464 // Now iterate over the nodes that can start a match, looking for ones
465 // with the same char class as our ending node.
466 RBBINode *startNode;
467 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
468 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
469 if (startNode->fType != RBBINode::leafChar) {
470 continue;
471 }
472
473 if (endNode->fVal == startNode->fVal) {
474 // The end val (character class) of one possible match is the
475 // same as the start of another.
476
477 // Add all nodes from the followPos of the start node to the
478 // followPos set of the end node, which will have the effect of
479 // letting matches transition from a match state at endNode
480 // to the second char of a match starting with startNode.
481 setAdd(endNode->fFollowPos, startNode->fFollowPos);
482 }
483 }
484 }
485 }
486
487
488 //-----------------------------------------------------------------------------
489 //
490 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
491 // Do an swizzle similar to chaining, modifying the followPos set of
492 // the bofNode to include the followPos nodes from other {bot} nodes
493 // scattered through the tree.
494 //
495 // This function has much in common with calcChainedFollowPos().
496 //
497 //-----------------------------------------------------------------------------
bofFixup()498 void RBBITableBuilder::bofFixup() {
499
500 if (U_FAILURE(*fStatus)) {
501 return;
502 }
503
504 // The parse tree looks like this ...
505 // fTree root ---> <cat>
506 // / \ .
507 // <cat> <#end node>
508 // / \ .
509 // <bofNode> rest
510 // of tree
511 //
512 // We will be adding things to the followPos set of the <bofNode>
513 //
514 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
515 U_ASSERT(bofNode->fType == RBBINode::leafChar);
516 U_ASSERT(bofNode->fVal == 2);
517
518 // Get all nodes that can be the start a match of the user-written rules
519 // (excluding the fake bofNode)
520 // We want the nodes that can start a match in the
521 // part labeled "rest of tree"
522 //
523 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
524
525 RBBINode *startNode;
526 int startNodeIx;
527 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
528 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
529 if (startNode->fType != RBBINode::leafChar) {
530 continue;
531 }
532
533 if (startNode->fVal == bofNode->fVal) {
534 // We found a leaf node corresponding to a {bof} that was
535 // explicitly written into a rule.
536 // Add everything from the followPos set of this node to the
537 // followPos set of the fake bofNode at the start of the tree.
538 //
539 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
540 }
541 }
542 }
543
544 //-----------------------------------------------------------------------------
545 //
546 // buildStateTable() Determine the set of runtime DFA states and the
547 // transition tables for these states, by the algorithm
548 // of fig. 3.44 in Aho.
549 //
550 // Most of the comments are quotes of Aho's psuedo-code.
551 //
552 //-----------------------------------------------------------------------------
buildStateTable()553 void RBBITableBuilder::buildStateTable() {
554 if (U_FAILURE(*fStatus)) {
555 return;
556 }
557 RBBIStateDescriptor *failState;
558 // Set it to nullptr to avoid uninitialized warning
559 RBBIStateDescriptor *initialState = nullptr;
560 //
561 // Add a dummy state 0 - the stop state. Not from Aho.
562 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
563 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
564 if (failState == nullptr) {
565 *fStatus = U_MEMORY_ALLOCATION_ERROR;
566 goto ExitBuildSTdeleteall;
567 }
568 failState->fPositions = new UVector(*fStatus);
569 if (failState->fPositions == nullptr) {
570 *fStatus = U_MEMORY_ALLOCATION_ERROR;
571 }
572 if (failState->fPositions == nullptr || U_FAILURE(*fStatus)) {
573 goto ExitBuildSTdeleteall;
574 }
575 fDStates->addElement(failState, *fStatus);
576 if (U_FAILURE(*fStatus)) {
577 goto ExitBuildSTdeleteall;
578 }
579
580 // initially, the only unmarked state in Dstates is firstpos(root),
581 // where toot is the root of the syntax tree for (r)#;
582 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
583 if (initialState == nullptr) {
584 *fStatus = U_MEMORY_ALLOCATION_ERROR;
585 }
586 if (U_FAILURE(*fStatus)) {
587 goto ExitBuildSTdeleteall;
588 }
589 initialState->fPositions = new UVector(*fStatus);
590 if (initialState->fPositions == nullptr) {
591 *fStatus = U_MEMORY_ALLOCATION_ERROR;
592 }
593 if (U_FAILURE(*fStatus)) {
594 goto ExitBuildSTdeleteall;
595 }
596 setAdd(initialState->fPositions, fTree->fFirstPosSet);
597 fDStates->addElement(initialState, *fStatus);
598 if (U_FAILURE(*fStatus)) {
599 goto ExitBuildSTdeleteall;
600 }
601
602 // while there is an unmarked state T in Dstates do begin
603 for (;;) {
604 RBBIStateDescriptor *T = nullptr;
605 int32_t tx;
606 for (tx=1; tx<fDStates->size(); tx++) {
607 RBBIStateDescriptor *temp;
608 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
609 if (temp->fMarked == false) {
610 T = temp;
611 break;
612 }
613 }
614 if (T == nullptr) {
615 break;
616 }
617
618 // mark T;
619 T->fMarked = true;
620
621 // for each input symbol a do begin
622 int32_t a;
623 for (a = 1; a<=lastInputSymbol; a++) {
624 // let U be the set of positions that are in followpos(p)
625 // for some position p in T
626 // such that the symbol at position p is a;
627 UVector *U = nullptr;
628 RBBINode *p;
629 int32_t px;
630 for (px=0; px<T->fPositions->size(); px++) {
631 p = (RBBINode *)T->fPositions->elementAt(px);
632 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
633 if (U == nullptr) {
634 U = new UVector(*fStatus);
635 if (U == nullptr) {
636 *fStatus = U_MEMORY_ALLOCATION_ERROR;
637 goto ExitBuildSTdeleteall;
638 }
639 }
640 setAdd(U, p->fFollowPos);
641 }
642 }
643
644 // if U is not empty and not in DStates then
645 int32_t ux = 0;
646 UBool UinDstates = false;
647 if (U != nullptr) {
648 U_ASSERT(U->size() > 0);
649 int ix;
650 for (ix=0; ix<fDStates->size(); ix++) {
651 RBBIStateDescriptor *temp2;
652 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
653 if (setEquals(U, temp2->fPositions)) {
654 delete U;
655 U = temp2->fPositions;
656 ux = ix;
657 UinDstates = true;
658 break;
659 }
660 }
661
662 // Add U as an unmarked state to Dstates
663 if (!UinDstates)
664 {
665 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
666 if (newState == nullptr) {
667 *fStatus = U_MEMORY_ALLOCATION_ERROR;
668 }
669 if (U_FAILURE(*fStatus)) {
670 goto ExitBuildSTdeleteall;
671 }
672 newState->fPositions = U;
673 fDStates->addElement(newState, *fStatus);
674 if (U_FAILURE(*fStatus)) {
675 return;
676 }
677 ux = fDStates->size()-1;
678 }
679
680 // Dtran[T, a] := U;
681 T->fDtran->setElementAt(ux, a);
682 }
683 }
684 }
685 return;
686 // delete local pointers only if error occurred.
687 ExitBuildSTdeleteall:
688 delete initialState;
689 delete failState;
690 }
691
692
693 /**
694 * mapLookAheadRules
695 *
696 */
mapLookAheadRules()697 void RBBITableBuilder::mapLookAheadRules() {
698 fLookAheadRuleMap = new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
699 if (fLookAheadRuleMap == nullptr) {
700 *fStatus = U_MEMORY_ALLOCATION_ERROR;
701 }
702 if (U_FAILURE(*fStatus)) {
703 return;
704 }
705 fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
706
707 for (int32_t n=0; n<fDStates->size(); n++) {
708 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
709 int32_t laSlotForState = 0;
710
711 // Establish the look-ahead slot for this state, if the state covers
712 // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
713
714 // If any of the look-ahead nodes already have a slot assigned, use it,
715 // otherwise assign a new one.
716
717 bool sawLookAheadNode = false;
718 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
719 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
720 if (node->fType != RBBINode::NodeType::lookAhead) {
721 continue;
722 }
723 sawLookAheadNode = true;
724 int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
725 U_ASSERT(ruleNum < fLookAheadRuleMap->size());
726 U_ASSERT(ruleNum > 0);
727 int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
728 if (laSlot != 0) {
729 if (laSlotForState == 0) {
730 laSlotForState = laSlot;
731 } else {
732 // TODO: figure out if this can fail, change to setting an error code if so.
733 U_ASSERT(laSlot == laSlotForState);
734 }
735 }
736 }
737 if (!sawLookAheadNode) {
738 continue;
739 }
740
741 if (laSlotForState == 0) {
742 laSlotForState = ++fLASlotsInUse;
743 }
744
745 // For each look ahead node covered by this state,
746 // set the mapping from the node's rule number to the look ahead slot.
747 // There can be multiple nodes/rule numbers going to the same la slot.
748
749 for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
750 RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
751 if (node->fType != RBBINode::NodeType::lookAhead) {
752 continue;
753 }
754 int32_t ruleNum = node->fVal; // Set when rule was originally parsed.
755 int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
756 (void)existingVal;
757 U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
758 fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
759 }
760 }
761
762 }
763
764 //-----------------------------------------------------------------------------
765 //
766 // flagAcceptingStates Identify accepting states.
767 // First get a list of all of the end marker nodes.
768 // Then, for each state s,
769 // if s contains one of the end marker nodes in its list of tree positions then
770 // s is an accepting state.
771 //
772 //-----------------------------------------------------------------------------
flagAcceptingStates()773 void RBBITableBuilder::flagAcceptingStates() {
774 if (U_FAILURE(*fStatus)) {
775 return;
776 }
777 UVector endMarkerNodes(*fStatus);
778 RBBINode *endMarker;
779 int32_t i;
780 int32_t n;
781
782 if (U_FAILURE(*fStatus)) {
783 return;
784 }
785
786 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
787 if (U_FAILURE(*fStatus)) {
788 return;
789 }
790
791 for (i=0; i<endMarkerNodes.size(); i++) {
792 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
793 for (n=0; n<fDStates->size(); n++) {
794 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
795 if (sd->fPositions->indexOf(endMarker) >= 0) {
796 // Any non-zero value for fAccepting means this is an accepting node.
797 // The value is what will be returned to the user as the break status.
798 // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
799
800 if (sd->fAccepting==0) {
801 // State hasn't been marked as accepting yet. Do it now.
802 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
803 if (sd->fAccepting == 0) {
804 sd->fAccepting = ACCEPTING_UNCONDITIONAL;
805 }
806 }
807 if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) {
808 // Both lookahead and non-lookahead accepting for this state.
809 // Favor the look-ahead, because a look-ahead match needs to
810 // immediately stop the run-time engine. First match, not longest.
811 sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
812 }
813 // implicit else:
814 // if sd->fAccepting already had a value other than 0 or 1, leave it be.
815 }
816 }
817 }
818 }
819
820
821 //-----------------------------------------------------------------------------
822 //
823 // flagLookAheadStates Very similar to flagAcceptingStates, above.
824 //
825 //-----------------------------------------------------------------------------
flagLookAheadStates()826 void RBBITableBuilder::flagLookAheadStates() {
827 if (U_FAILURE(*fStatus)) {
828 return;
829 }
830 UVector lookAheadNodes(*fStatus);
831 RBBINode *lookAheadNode;
832 int32_t i;
833 int32_t n;
834
835 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
836 if (U_FAILURE(*fStatus)) {
837 return;
838 }
839 for (i=0; i<lookAheadNodes.size(); i++) {
840 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
841 U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
842
843 for (n=0; n<fDStates->size(); n++) {
844 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
845 int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
846 if (positionsIdx >= 0) {
847 U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
848 uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
849 U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
850 // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
851 // printf("%s:%d Bingo. sd->fLookAhead:%d lookaheadSlot:%d\n",
852 // __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
853 // }
854 sd->fLookAhead = lookaheadSlot;
855 }
856 }
857 }
858 }
859
860
861
862
863 //-----------------------------------------------------------------------------
864 //
865 // flagTaggedStates
866 //
867 //-----------------------------------------------------------------------------
flagTaggedStates()868 void RBBITableBuilder::flagTaggedStates() {
869 if (U_FAILURE(*fStatus)) {
870 return;
871 }
872 UVector tagNodes(*fStatus);
873 RBBINode *tagNode;
874 int32_t i;
875 int32_t n;
876
877 if (U_FAILURE(*fStatus)) {
878 return;
879 }
880 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
881 if (U_FAILURE(*fStatus)) {
882 return;
883 }
884 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
885 tagNode = (RBBINode *)tagNodes.elementAt(i);
886
887 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
888 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
889 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
890 sortedAdd(&sd->fTagVals, tagNode->fVal);
891 }
892 }
893 }
894 }
895
896
897
898
899 //-----------------------------------------------------------------------------
900 //
901 // mergeRuleStatusVals
902 //
903 // Update the global table of rule status {tag} values
904 // The rule builder has a global vector of status values that are common
905 // for all tables. Merge the ones from this table into the global set.
906 //
907 //-----------------------------------------------------------------------------
mergeRuleStatusVals()908 void RBBITableBuilder::mergeRuleStatusVals() {
909 //
910 // The basic outline of what happens here is this...
911 //
912 // for each state in this state table
913 // if the status tag list for this state is in the global statuses list
914 // record where and
915 // continue with the next state
916 // else
917 // add the tag list for this state to the global list.
918 //
919 int i;
920 int n;
921
922 // Pre-set a single tag of {0} into the table.
923 // We will need this as a default, for rule sets with no explicit tagging.
924 if (fRB->fRuleStatusVals->size() == 0) {
925 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
926 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
927 }
928
929 // For each state
930 for (n=0; n<fDStates->size(); n++) {
931 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
932 UVector *thisStatesTagValues = sd->fTagVals;
933 if (thisStatesTagValues == nullptr) {
934 // No tag values are explicitly associated with this state.
935 // Set the default tag value.
936 sd->fTagsIdx = 0;
937 continue;
938 }
939
940 // There are tag(s) associated with this state.
941 // fTagsIdx will be the index into the global tag list for this state's tag values.
942 // Initial value of -1 flags that we haven't got it set yet.
943 sd->fTagsIdx = -1;
944 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
945 int32_t nextTagGroupStart = 0;
946
947 // Loop runs once per group of tags in the global list
948 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
949 thisTagGroupStart = nextTagGroupStart;
950 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
951 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
952 // The number of tags for this state is different from
953 // the number of tags in this group from the global list.
954 // Continue with the next group from the global list.
955 continue;
956 }
957 // The lengths match, go ahead and compare the actual tag values
958 // between this state and the group from the global list.
959 for (i=0; i<thisStatesTagValues->size(); i++) {
960 if (thisStatesTagValues->elementAti(i) !=
961 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
962 // Mismatch.
963 break;
964 }
965 }
966
967 if (i == thisStatesTagValues->size()) {
968 // We found a set of tag values in the global list that match
969 // those for this state. Use them.
970 sd->fTagsIdx = thisTagGroupStart;
971 break;
972 }
973 }
974
975 if (sd->fTagsIdx == -1) {
976 // No suitable entry in the global tag list already. Add one
977 sd->fTagsIdx = fRB->fRuleStatusVals->size();
978 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
979 for (i=0; i<thisStatesTagValues->size(); i++) {
980 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
981 }
982 }
983 }
984 }
985
986
987
988
989
990
991
992 //-----------------------------------------------------------------------------
993 //
994 // sortedAdd Add a value to a vector of sorted values (ints).
995 // Do not replicate entries; if the value is already there, do not
996 // add a second one.
997 // Lazily create the vector if it does not already exist.
998 //
999 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)1000 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
1001 int32_t i;
1002
1003 if (*vector == nullptr) {
1004 *vector = new UVector(*fStatus);
1005 }
1006 if (*vector == nullptr || U_FAILURE(*fStatus)) {
1007 return;
1008 }
1009 UVector *vec = *vector;
1010 int32_t vSize = vec->size();
1011 for (i=0; i<vSize; i++) {
1012 int32_t valAtI = vec->elementAti(i);
1013 if (valAtI == val) {
1014 // The value is already in the vector. Don't add it again.
1015 return;
1016 }
1017 if (valAtI > val) {
1018 break;
1019 }
1020 }
1021 vec->insertElementAt(val, i, *fStatus);
1022 }
1023
1024
1025
1026 //-----------------------------------------------------------------------------
1027 //
1028 // setAdd Set operation on UVector
1029 // dest = dest union source
1030 // Elements may only appear once and must be sorted.
1031 //
1032 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)1033 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
1034 U_ASSERT(!dest->hasDeleter());
1035 U_ASSERT(!source->hasDeleter());
1036 int32_t destOriginalSize = dest->size();
1037 int32_t sourceSize = source->size();
1038 int32_t di = 0;
1039 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
1040 void **destPtr, **sourcePtr;
1041 void **destLim, **sourceLim;
1042
1043 if (destOriginalSize > destArray.getCapacity()) {
1044 if (destArray.resize(destOriginalSize) == nullptr) {
1045 return;
1046 }
1047 }
1048 destPtr = destArray.getAlias();
1049 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
1050
1051 if (sourceSize > sourceArray.getCapacity()) {
1052 if (sourceArray.resize(sourceSize) == nullptr) {
1053 return;
1054 }
1055 }
1056 sourcePtr = sourceArray.getAlias();
1057 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
1058
1059 // Avoid multiple "get element" calls by getting the contents into arrays
1060 (void) dest->toArray(destPtr);
1061 (void) source->toArray(sourcePtr);
1062
1063 dest->setSize(sourceSize+destOriginalSize, *fStatus);
1064 if (U_FAILURE(*fStatus)) {
1065 return;
1066 }
1067
1068 while (sourcePtr < sourceLim && destPtr < destLim) {
1069 if (*destPtr == *sourcePtr) {
1070 dest->setElementAt(*sourcePtr++, di++);
1071 destPtr++;
1072 }
1073 // This check is required for machines with segmented memory, like i5/OS.
1074 // Direct pointer comparison is not recommended.
1075 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1076 dest->setElementAt(*destPtr++, di++);
1077 }
1078 else { /* *sourcePtr < *destPtr */
1079 dest->setElementAt(*sourcePtr++, di++);
1080 }
1081 }
1082
1083 // At most one of these two cleanup loops will execute
1084 while (destPtr < destLim) {
1085 dest->setElementAt(*destPtr++, di++);
1086 }
1087 while (sourcePtr < sourceLim) {
1088 dest->setElementAt(*sourcePtr++, di++);
1089 }
1090
1091 dest->setSize(di, *fStatus);
1092 }
1093
1094
1095
1096 //-----------------------------------------------------------------------------
1097 //
1098 // setEqual Set operation on UVector.
1099 // Compare for equality.
1100 // Elements must be sorted.
1101 //
1102 //-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1103 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1104 return a->equals(*b);
1105 }
1106
1107
1108 //-----------------------------------------------------------------------------
1109 //
1110 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1111 // for each node in the tree.
1112 //
1113 //-----------------------------------------------------------------------------
1114 #ifdef RBBI_DEBUG
printPosSets(RBBINode * n)1115 void RBBITableBuilder::printPosSets(RBBINode *n) {
1116 if (n==nullptr) {
1117 return;
1118 }
1119 printf("\n");
1120 RBBINode::printNodeHeader();
1121 RBBINode::printNode(n);
1122 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"true":"false");
1123
1124 RBBIDebugPrintf(" firstpos: ");
1125 printSet(n->fFirstPosSet);
1126
1127 RBBIDebugPrintf(" lastpos: ");
1128 printSet(n->fLastPosSet);
1129
1130 RBBIDebugPrintf(" followpos: ");
1131 printSet(n->fFollowPos);
1132
1133 printPosSets(n->fLeftChild);
1134 printPosSets(n->fRightChild);
1135 }
1136 #endif
1137
1138 //
1139 // findDuplCharClassFrom()
1140 //
findDuplCharClassFrom(IntPair * categories)1141 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1142 int32_t numStates = fDStates->size();
1143 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1144
1145 for (; categories->first < numCols-1; categories->first++) {
1146 // Note: dictionary & non-dictionary columns cannot be merged.
1147 // The limitSecond value prevents considering mixed pairs.
1148 // Dictionary categories are >= DictCategoriesStart.
1149 // Non dict categories are < DictCategoriesStart.
1150 int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
1151 fRB->fSetBuilder->getDictCategoriesStart() : numCols;
1152 for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
1153 // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1154 uint16_t table_base = 0;
1155 uint16_t table_dupl = 1;
1156 for (int32_t state=0; state<numStates; state++) {
1157 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1158 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1159 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1160 if (table_base != table_dupl) {
1161 break;
1162 }
1163 }
1164 if (table_base == table_dupl) {
1165 return true;
1166 }
1167 }
1168 }
1169 return false;
1170 }
1171
1172
1173 //
1174 // removeColumn()
1175 //
removeColumn(int32_t column)1176 void RBBITableBuilder::removeColumn(int32_t column) {
1177 int32_t numStates = fDStates->size();
1178 for (int32_t state=0; state<numStates; state++) {
1179 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1180 U_ASSERT(column < sd->fDtran->size());
1181 sd->fDtran->removeElementAt(column);
1182 }
1183 }
1184
1185 /*
1186 * findDuplicateState
1187 */
findDuplicateState(IntPair * states)1188 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1189 int32_t numStates = fDStates->size();
1190 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1191
1192 for (; states->first<numStates-1; states->first++) {
1193 RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1194 for (states->second=states->first+1; states->second<numStates; states->second++) {
1195 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1196 if (firstSD->fAccepting != duplSD->fAccepting ||
1197 firstSD->fLookAhead != duplSD->fLookAhead ||
1198 firstSD->fTagsIdx != duplSD->fTagsIdx) {
1199 continue;
1200 }
1201 bool rowsMatch = true;
1202 for (int32_t col=0; col < numCols; ++col) {
1203 int32_t firstVal = firstSD->fDtran->elementAti(col);
1204 int32_t duplVal = duplSD->fDtran->elementAti(col);
1205 if (!((firstVal == duplVal) ||
1206 ((firstVal == states->first || firstVal == states->second) &&
1207 (duplVal == states->first || duplVal == states->second)))) {
1208 rowsMatch = false;
1209 break;
1210 }
1211 }
1212 if (rowsMatch) {
1213 return true;
1214 }
1215 }
1216 }
1217 return false;
1218 }
1219
1220
findDuplicateSafeState(IntPair * states)1221 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1222 int32_t numStates = fSafeTable->size();
1223
1224 for (; states->first<numStates-1; states->first++) {
1225 UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1226 for (states->second=states->first+1; states->second<numStates; states->second++) {
1227 UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1228 bool rowsMatch = true;
1229 int32_t numCols = firstRow->length();
1230 for (int32_t col=0; col < numCols; ++col) {
1231 int32_t firstVal = firstRow->charAt(col);
1232 int32_t duplVal = duplRow->charAt(col);
1233 if (!((firstVal == duplVal) ||
1234 ((firstVal == states->first || firstVal == states->second) &&
1235 (duplVal == states->first || duplVal == states->second)))) {
1236 rowsMatch = false;
1237 break;
1238 }
1239 }
1240 if (rowsMatch) {
1241 return true;
1242 }
1243 }
1244 }
1245 return false;
1246 }
1247
1248
removeState(IntPair duplStates)1249 void RBBITableBuilder::removeState(IntPair duplStates) {
1250 const int32_t keepState = duplStates.first;
1251 const int32_t duplState = duplStates.second;
1252 U_ASSERT(keepState < duplState);
1253 U_ASSERT(duplState < fDStates->size());
1254
1255 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1256 fDStates->removeElementAt(duplState);
1257 delete duplSD;
1258
1259 int32_t numStates = fDStates->size();
1260 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1261 for (int32_t state=0; state<numStates; ++state) {
1262 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1263 for (int32_t col=0; col<numCols; col++) {
1264 int32_t existingVal = sd->fDtran->elementAti(col);
1265 int32_t newVal = existingVal;
1266 if (existingVal == duplState) {
1267 newVal = keepState;
1268 } else if (existingVal > duplState) {
1269 newVal = existingVal - 1;
1270 }
1271 sd->fDtran->setElementAt(newVal, col);
1272 }
1273 }
1274 }
1275
removeSafeState(IntPair duplStates)1276 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1277 const int32_t keepState = duplStates.first;
1278 const int32_t duplState = duplStates.second;
1279 U_ASSERT(keepState < duplState);
1280 U_ASSERT(duplState < fSafeTable->size());
1281
1282 fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function
1283 // and will auto-delete the removed element.
1284 int32_t numStates = fSafeTable->size();
1285 for (int32_t state=0; state<numStates; ++state) {
1286 UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1287 int32_t numCols = sd->length();
1288 for (int32_t col=0; col<numCols; col++) {
1289 int32_t existingVal = sd->charAt(col);
1290 int32_t newVal = existingVal;
1291 if (existingVal == duplState) {
1292 newVal = keepState;
1293 } else if (existingVal > duplState) {
1294 newVal = existingVal - 1;
1295 }
1296 sd->setCharAt(col, static_cast<char16_t>(newVal));
1297 }
1298 }
1299 }
1300
1301
1302 /*
1303 * RemoveDuplicateStates
1304 */
removeDuplicateStates()1305 int32_t RBBITableBuilder::removeDuplicateStates() {
1306 IntPair dupls = {3, 0};
1307 int32_t numStatesRemoved = 0;
1308
1309 while (findDuplicateState(&dupls)) {
1310 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1311 removeState(dupls);
1312 ++numStatesRemoved;
1313 }
1314 return numStatesRemoved;
1315 }
1316
1317
1318 //-----------------------------------------------------------------------------
1319 //
1320 // getTableSize() Calculate the size of the runtime form of this
1321 // state transition table.
1322 //
1323 //-----------------------------------------------------------------------------
getTableSize() const1324 int32_t RBBITableBuilder::getTableSize() const {
1325 int32_t size = 0;
1326 int32_t numRows;
1327 int32_t numCols;
1328 int32_t rowSize;
1329
1330 if (fTree == nullptr) {
1331 return 0;
1332 }
1333
1334 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1335
1336 numRows = fDStates->size();
1337 numCols = fRB->fSetBuilder->getNumCharCategories();
1338
1339 if (use8BitsForTable()) {
1340 rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1341 } else {
1342 rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1343 }
1344 size += numRows * rowSize;
1345 return size;
1346 }
1347
use8BitsForTable() const1348 bool RBBITableBuilder::use8BitsForTable() const {
1349 return fDStates->size() <= kMaxStateFor8BitsTable;
1350 }
1351
1352 //-----------------------------------------------------------------------------
1353 //
1354 // exportTable() export the state transition table in the format required
1355 // by the runtime engine. getTableSize() bytes of memory
1356 // must be available at the output address "where".
1357 //
1358 //-----------------------------------------------------------------------------
exportTable(void * where)1359 void RBBITableBuilder::exportTable(void *where) {
1360 RBBIStateTable *table = (RBBIStateTable *)where;
1361 uint32_t state;
1362 int col;
1363
1364 if (U_FAILURE(*fStatus) || fTree == nullptr) {
1365 return;
1366 }
1367
1368 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1369 if (catCount > 0x7fff ||
1370 fDStates->size() > 0x7fff) {
1371 *fStatus = U_BRK_INTERNAL_ERROR;
1372 return;
1373 }
1374
1375 table->fNumStates = fDStates->size();
1376 table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
1377 table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
1378 table->fFlags = 0;
1379 if (use8BitsForTable()) {
1380 table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1381 table->fFlags |= RBBI_8BITS_ROWS;
1382 } else {
1383 table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1384 }
1385 if (fRB->fLookAheadHardBreak) {
1386 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1387 }
1388 if (fRB->fSetBuilder->sawBOF()) {
1389 table->fFlags |= RBBI_BOF_REQUIRED;
1390 }
1391
1392 for (state=0; state<table->fNumStates; state++) {
1393 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1394 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1395 if (use8BitsForTable()) {
1396 U_ASSERT (sd->fAccepting <= 255);
1397 U_ASSERT (sd->fLookAhead <= 255);
1398 U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255);
1399 RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1400 r8->fAccepting = sd->fAccepting;
1401 r8->fLookAhead = sd->fLookAhead;
1402 r8->fTagsIdx = sd->fTagsIdx;
1403 for (col=0; col<catCount; col++) {
1404 U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
1405 r8->fNextState[col] = sd->fDtran->elementAti(col);
1406 }
1407 } else {
1408 U_ASSERT (sd->fAccepting <= 0xffff);
1409 U_ASSERT (sd->fLookAhead <= 0xffff);
1410 U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff);
1411 row->r16.fAccepting = sd->fAccepting;
1412 row->r16.fLookAhead = sd->fLookAhead;
1413 row->r16.fTagsIdx = sd->fTagsIdx;
1414 for (col=0; col<catCount; col++) {
1415 row->r16.fNextState[col] = sd->fDtran->elementAti(col);
1416 }
1417 }
1418 }
1419 }
1420
1421
1422 /**
1423 * Synthesize a safe state table from the main state table.
1424 */
buildSafeReverseTable(UErrorCode & status)1425 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1426 // The safe table creation has three steps:
1427
1428 // 1. Identify pairs of character classes that are "safe." Safe means that boundaries
1429 // following the pair do not depend on context or state before the pair. To test
1430 // whether a pair is safe, run it through the main forward state table, starting
1431 // from each state. If the the final state is the same, no matter what the starting state,
1432 // the pair is safe.
1433 //
1434 // 2. Build a state table that recognizes the safe pairs. It's similar to their
1435 // forward table, with a column for each input character [class], and a row for
1436 // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1437 // create an additional state for each input character category; being in
1438 // one of these states means that the character has been seen, and is potentially
1439 // the first of a pair. In each of these rows, the entry for the second character
1440 // of a safe pair is set to the stop state (0), indicating that a match was found.
1441 // All other table entries are set to the state corresponding the current input
1442 // character, allowing that character to be the of a start following pair.
1443 //
1444 // Because the safe rules are to be run in reverse, moving backwards in the text,
1445 // the first and second pair categories are swapped when building the table.
1446 //
1447 // 3. Compress the table. There are typically many rows (states) that are
1448 // equivalent - that have zeroes (match completed) in the same columns -
1449 // and can be folded together.
1450
1451 // Each safe pair is stored as two UChars in the safePair string.
1452 UnicodeString safePairs;
1453
1454 int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1455 int32_t numStates = fDStates->size();
1456
1457 for (int32_t c1=0; c1<numCharClasses; ++c1) {
1458 for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1459 int32_t wantedEndState = -1;
1460 int32_t endState = 0;
1461 for (int32_t startState = 1; startState < numStates; ++startState) {
1462 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1463 int32_t s2 = startStateD->fDtran->elementAti(c1);
1464 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1465 endState = s2StateD->fDtran->elementAti(c2);
1466 if (wantedEndState < 0) {
1467 wantedEndState = endState;
1468 } else {
1469 if (wantedEndState != endState) {
1470 break;
1471 }
1472 }
1473 }
1474 if (wantedEndState == endState) {
1475 safePairs.append((char16_t)c1);
1476 safePairs.append((char16_t)c2);
1477 // printf("(%d, %d) ", c1, c2);
1478 }
1479 }
1480 // printf("\n");
1481 }
1482
1483 // Populate the initial safe table.
1484 // The table as a whole is UVector<UnicodeString>
1485 // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1486 // Row 0 is the stop state.
1487 // Row 1 is the start state.
1488 // Row 2 and beyond are other states, initially one per char class, but
1489 // after initial construction, many of the states will be combined, compacting the table.
1490 // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1491 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1492
1493 U_ASSERT(fSafeTable == nullptr);
1494 LocalPointer<UVector> lpSafeTable(
1495 new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status), status);
1496 if (U_FAILURE(status)) {
1497 return;
1498 }
1499 fSafeTable = lpSafeTable.orphan();
1500 for (int32_t row=0; row<numCharClasses + 2; ++row) {
1501 LocalPointer<UnicodeString> lpString(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1502 fSafeTable->adoptElement(lpString.orphan(), status);
1503 }
1504 if (U_FAILURE(status)) {
1505 return;
1506 }
1507
1508 // From the start state, each input char class transitions to the state for that input.
1509 UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1510 for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1511 // Note: +2 for the start & stop state.
1512 startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1513 }
1514
1515 // Initially make every other state table row look like the start state row,
1516 for (int32_t row=2; row<numCharClasses+2; ++row) {
1517 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1518 rowState = startState; // UnicodeString assignment, copies contents.
1519 }
1520
1521 // Run through the safe pairs, set the next state to zero when pair has been seen.
1522 // Zero being the stop state, meaning we found a safe point.
1523 for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1524 int32_t c1 = safePairs.charAt(pairIdx);
1525 int32_t c2 = safePairs.charAt(pairIdx + 1);
1526
1527 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1528 rowState.setCharAt(c1, 0);
1529 }
1530
1531 // Remove duplicate or redundant rows from the table.
1532 IntPair states = {1, 0};
1533 while (findDuplicateSafeState(&states)) {
1534 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1535 removeSafeState(states);
1536 }
1537 }
1538
1539
1540 //-----------------------------------------------------------------------------
1541 //
1542 // getSafeTableSize() Calculate the size of the runtime form of this
1543 // safe state table.
1544 //
1545 //-----------------------------------------------------------------------------
getSafeTableSize() const1546 int32_t RBBITableBuilder::getSafeTableSize() const {
1547 int32_t size = 0;
1548 int32_t numRows;
1549 int32_t numCols;
1550 int32_t rowSize;
1551
1552 if (fSafeTable == nullptr) {
1553 return 0;
1554 }
1555
1556 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1557
1558 numRows = fSafeTable->size();
1559 numCols = fRB->fSetBuilder->getNumCharCategories();
1560
1561 if (use8BitsForSafeTable()) {
1562 rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1563 } else {
1564 rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1565 }
1566 size += numRows * rowSize;
1567 return size;
1568 }
1569
use8BitsForSafeTable() const1570 bool RBBITableBuilder::use8BitsForSafeTable() const {
1571 return fSafeTable->size() <= kMaxStateFor8BitsTable;
1572 }
1573
1574 //-----------------------------------------------------------------------------
1575 //
1576 // exportSafeTable() export the state transition table in the format required
1577 // by the runtime engine. getTableSize() bytes of memory
1578 // must be available at the output address "where".
1579 //
1580 //-----------------------------------------------------------------------------
exportSafeTable(void * where)1581 void RBBITableBuilder::exportSafeTable(void *where) {
1582 RBBIStateTable *table = (RBBIStateTable *)where;
1583 uint32_t state;
1584 int col;
1585
1586 if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1587 return;
1588 }
1589
1590 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1591 if (catCount > 0x7fff ||
1592 fSafeTable->size() > 0x7fff) {
1593 *fStatus = U_BRK_INTERNAL_ERROR;
1594 return;
1595 }
1596
1597 table->fNumStates = fSafeTable->size();
1598 table->fFlags = 0;
1599 if (use8BitsForSafeTable()) {
1600 table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1601 table->fFlags |= RBBI_8BITS_ROWS;
1602 } else {
1603 table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1604 }
1605
1606 for (state=0; state<table->fNumStates; state++) {
1607 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1608 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1609 if (use8BitsForSafeTable()) {
1610 RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1611 r8->fAccepting = 0;
1612 r8->fLookAhead = 0;
1613 r8->fTagsIdx = 0;
1614 for (col=0; col<catCount; col++) {
1615 U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
1616 r8->fNextState[col] = static_cast<uint8_t>(rowString->charAt(col));
1617 }
1618 } else {
1619 row->r16.fAccepting = 0;
1620 row->r16.fLookAhead = 0;
1621 row->r16.fTagsIdx = 0;
1622 for (col=0; col<catCount; col++) {
1623 row->r16.fNextState[col] = rowString->charAt(col);
1624 }
1625 }
1626 }
1627 }
1628
1629
1630
1631
1632 //-----------------------------------------------------------------------------
1633 //
1634 // printSet Debug function. Print the contents of a UVector
1635 //
1636 //-----------------------------------------------------------------------------
1637 #ifdef RBBI_DEBUG
printSet(UVector * s)1638 void RBBITableBuilder::printSet(UVector *s) {
1639 int32_t i;
1640 for (i=0; i<s->size(); i++) {
1641 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1642 RBBIDebugPrintf("%5d", v==nullptr? -1 : v->fSerialNum);
1643 }
1644 RBBIDebugPrintf("\n");
1645 }
1646 #endif
1647
1648
1649 //-----------------------------------------------------------------------------
1650 //
1651 // printStates Debug Function. Dump the fully constructed state transition table.
1652 //
1653 //-----------------------------------------------------------------------------
1654 #ifdef RBBI_DEBUG
printStates()1655 void RBBITableBuilder::printStates() {
1656 int c; // input "character"
1657 int n; // state number
1658
1659 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1660 RBBIDebugPrintf(" | Acc LA Tag");
1661 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1662 RBBIDebugPrintf(" %3d", c);
1663 }
1664 RBBIDebugPrintf("\n");
1665 RBBIDebugPrintf(" |---------------");
1666 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1667 RBBIDebugPrintf("----");
1668 }
1669 RBBIDebugPrintf("\n");
1670
1671 for (n=0; n<fDStates->size(); n++) {
1672 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1673 RBBIDebugPrintf(" %3d | " , n);
1674 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1675 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1676 RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
1677 }
1678 RBBIDebugPrintf("\n");
1679 }
1680 RBBIDebugPrintf("\n\n");
1681 }
1682 #endif
1683
1684
1685 //-----------------------------------------------------------------------------
1686 //
1687 // printSafeTable Debug Function. Dump the fully constructed safe table.
1688 //
1689 //-----------------------------------------------------------------------------
1690 #ifdef RBBI_DEBUG
printReverseTable()1691 void RBBITableBuilder::printReverseTable() {
1692 int c; // input "character"
1693 int n; // state number
1694
1695 RBBIDebugPrintf(" Safe Reverse Table \n");
1696 if (fSafeTable == nullptr) {
1697 RBBIDebugPrintf(" --- nullptr ---\n");
1698 return;
1699 }
1700 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1701 RBBIDebugPrintf(" | Acc LA Tag");
1702 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1703 RBBIDebugPrintf(" %2d", c);
1704 }
1705 RBBIDebugPrintf("\n");
1706 RBBIDebugPrintf(" |---------------");
1707 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1708 RBBIDebugPrintf("---");
1709 }
1710 RBBIDebugPrintf("\n");
1711
1712 for (n=0; n<fSafeTable->size(); n++) {
1713 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1714 RBBIDebugPrintf(" %3d | " , n);
1715 RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
1716 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1717 RBBIDebugPrintf(" %2d", rowString->charAt(c));
1718 }
1719 RBBIDebugPrintf("\n");
1720 }
1721 RBBIDebugPrintf("\n\n");
1722 }
1723 #endif
1724
1725
1726
1727 //-----------------------------------------------------------------------------
1728 //
1729 // printRuleStatusTable Debug Function. Dump the common rule status table
1730 //
1731 //-----------------------------------------------------------------------------
1732 #ifdef RBBI_DEBUG
printRuleStatusTable()1733 void RBBITableBuilder::printRuleStatusTable() {
1734 int32_t thisRecord = 0;
1735 int32_t nextRecord = 0;
1736 int i;
1737 UVector *tbl = fRB->fRuleStatusVals;
1738
1739 RBBIDebugPrintf("index | tags \n");
1740 RBBIDebugPrintf("-------------------\n");
1741
1742 while (nextRecord < tbl->size()) {
1743 thisRecord = nextRecord;
1744 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1745 RBBIDebugPrintf("%4d ", thisRecord);
1746 for (i=thisRecord+1; i<nextRecord; i++) {
1747 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1748 }
1749 RBBIDebugPrintf("\n");
1750 }
1751 RBBIDebugPrintf("\n\n");
1752 }
1753 #endif
1754
1755
1756 //-----------------------------------------------------------------------------
1757 //
1758 // RBBIStateDescriptor Methods. This is a very struct-like class
1759 // Most access is directly to the fields.
1760 //
1761 //-----------------------------------------------------------------------------
1762
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1763 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1764 fMarked = false;
1765 fAccepting = 0;
1766 fLookAhead = 0;
1767 fTagsIdx = 0;
1768 fTagVals = nullptr;
1769 fPositions = nullptr;
1770 fDtran = nullptr;
1771
1772 fDtran = new UVector32(lastInputSymbol+1, *fStatus);
1773 if (U_FAILURE(*fStatus)) {
1774 return;
1775 }
1776 if (fDtran == nullptr) {
1777 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1778 return;
1779 }
1780 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
1781 // It is indexed by input symbols, and will
1782 // hold the next state number for each
1783 // symbol.
1784 }
1785
1786
~RBBIStateDescriptor()1787 RBBIStateDescriptor::~RBBIStateDescriptor() {
1788 delete fPositions;
1789 delete fDtran;
1790 delete fTagVals;
1791 fPositions = nullptr;
1792 fDtran = nullptr;
1793 fTagVals = nullptr;
1794 }
1795
1796 U_NAMESPACE_END
1797
1798 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1799