1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (c) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 // rbbitblb.cpp
11 //
12
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_BREAK_ITERATION
17
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbisetb.h"
22 #include "rbbidata.h"
23 #include "cstring.h"
24 #include "uassert.h"
25 #include "uvectr32.h"
26 #include "cmemory.h"
27
28 U_NAMESPACE_BEGIN
29
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode,UErrorCode & status)30 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
31 fRB(rb),
32 fTree(*rootNode),
33 fStatus(&status),
34 fDStates(nullptr),
35 fSafeTable(nullptr) {
36 if (U_FAILURE(status)) {
37 return;
38 }
39 // fDStates is UVector<RBBIStateDescriptor *>
40 fDStates = new UVector(status);
41 if (U_SUCCESS(status) && fDStates == nullptr ) {
42 status = U_MEMORY_ALLOCATION_ERROR;
43 }
44 }
45
46
47
~RBBITableBuilder()48 RBBITableBuilder::~RBBITableBuilder() {
49 int i;
50 for (i=0; i<fDStates->size(); i++) {
51 delete (RBBIStateDescriptor *)fDStates->elementAt(i);
52 }
53 delete fDStates;
54 delete fSafeTable;
55 }
56
57
58 //-----------------------------------------------------------------------------
59 //
60 // RBBITableBuilder::buildForwardTable - This is the main function for building
61 // the DFA state transition table from the RBBI rules parse tree.
62 //
63 //-----------------------------------------------------------------------------
buildForwardTable()64 void RBBITableBuilder::buildForwardTable() {
65
66 if (U_FAILURE(*fStatus)) {
67 return;
68 }
69
70 // If there were no rules, just return. This situation can easily arise
71 // for the reverse rules.
72 if (fTree==NULL) {
73 return;
74 }
75
76 //
77 // Walk through the tree, replacing any references to $variables with a copy of the
78 // parse tree for the substition expression.
79 //
80 fTree = fTree->flattenVariables();
81 #ifdef RBBI_DEBUG
82 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
83 RBBIDebugPuts("\nParse tree after flattening variable references.");
84 RBBINode::printTree(fTree, TRUE);
85 }
86 #endif
87
88 //
89 // If the rules contained any references to {bof}
90 // add a {bof} <cat> <former root of tree> to the
91 // tree. Means that all matches must start out with the
92 // {bof} fake character.
93 //
94 if (fRB->fSetBuilder->sawBOF()) {
95 RBBINode *bofTop = new RBBINode(RBBINode::opCat);
96 RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
97 // Delete and exit if memory allocation failed.
98 if (bofTop == NULL || bofLeaf == NULL) {
99 *fStatus = U_MEMORY_ALLOCATION_ERROR;
100 delete bofTop;
101 delete bofLeaf;
102 return;
103 }
104 bofTop->fLeftChild = bofLeaf;
105 bofTop->fRightChild = fTree;
106 bofLeaf->fParent = bofTop;
107 bofLeaf->fVal = 2; // Reserved value for {bof}.
108 fTree = bofTop;
109 }
110
111 //
112 // Add a unique right-end marker to the expression.
113 // Appears as a cat-node, left child being the original tree,
114 // right child being the end marker.
115 //
116 RBBINode *cn = new RBBINode(RBBINode::opCat);
117 // Exit if memory allocation failed.
118 if (cn == NULL) {
119 *fStatus = U_MEMORY_ALLOCATION_ERROR;
120 return;
121 }
122 cn->fLeftChild = fTree;
123 fTree->fParent = cn;
124 cn->fRightChild = new RBBINode(RBBINode::endMark);
125 // Delete and exit if memory allocation failed.
126 if (cn->fRightChild == NULL) {
127 *fStatus = U_MEMORY_ALLOCATION_ERROR;
128 delete cn;
129 return;
130 }
131 cn->fRightChild->fParent = cn;
132 fTree = cn;
133
134 //
135 // Replace all references to UnicodeSets with the tree for the equivalent
136 // expression.
137 //
138 fTree->flattenSets();
139 #ifdef RBBI_DEBUG
140 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
141 RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
142 RBBINode::printTree(fTree, TRUE);
143 }
144 #endif
145
146
147 //
148 // calculate the functions nullable, firstpos, lastpos and followpos on
149 // nodes in the parse tree.
150 // See the alogrithm description in Aho.
151 // Understanding how this works by looking at the code alone will be
152 // nearly impossible.
153 //
154 calcNullable(fTree);
155 calcFirstPos(fTree);
156 calcLastPos(fTree);
157 calcFollowPos(fTree);
158 if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
159 RBBIDebugPuts("\n");
160 printPosSets(fTree);
161 }
162
163 //
164 // For "chained" rules, modify the followPos sets
165 //
166 if (fRB->fChainRules) {
167 calcChainedFollowPos(fTree);
168 }
169
170 //
171 // BOF (start of input) test fixup.
172 //
173 if (fRB->fSetBuilder->sawBOF()) {
174 bofFixup();
175 }
176
177 //
178 // Build the DFA state transition tables.
179 //
180 buildStateTable();
181 flagAcceptingStates();
182 flagLookAheadStates();
183 flagTaggedStates();
184
185 //
186 // Update the global table of rule status {tag} values
187 // The rule builder has a global vector of status values that are common
188 // for all tables. Merge the ones from this table into the global set.
189 //
190 mergeRuleStatusVals();
191 }
192
193
194
195 //-----------------------------------------------------------------------------
196 //
197 // calcNullable. Impossible to explain succinctly. See Aho, section 3.9
198 //
199 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)200 void RBBITableBuilder::calcNullable(RBBINode *n) {
201 if (n == NULL) {
202 return;
203 }
204 if (n->fType == RBBINode::setRef ||
205 n->fType == RBBINode::endMark ) {
206 // These are non-empty leaf node types.
207 n->fNullable = FALSE;
208 return;
209 }
210
211 if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
212 // Lookahead marker node. It's a leaf, so no recursion on children.
213 // It's nullable because it does not match any literal text from the input stream.
214 n->fNullable = TRUE;
215 return;
216 }
217
218
219 // The node is not a leaf.
220 // Calculate nullable on its children.
221 calcNullable(n->fLeftChild);
222 calcNullable(n->fRightChild);
223
224 // Apply functions from table 3.40 in Aho
225 if (n->fType == RBBINode::opOr) {
226 n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
227 }
228 else if (n->fType == RBBINode::opCat) {
229 n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
230 }
231 else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
232 n->fNullable = TRUE;
233 }
234 else {
235 n->fNullable = FALSE;
236 }
237 }
238
239
240
241
242 //-----------------------------------------------------------------------------
243 //
244 // calcFirstPos. Impossible to explain succinctly. See Aho, section 3.9
245 //
246 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)247 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
248 if (n == NULL) {
249 return;
250 }
251 if (n->fType == RBBINode::leafChar ||
252 n->fType == RBBINode::endMark ||
253 n->fType == RBBINode::lookAhead ||
254 n->fType == RBBINode::tag) {
255 // These are non-empty leaf node types.
256 // Note: In order to maintain the sort invariant on the set,
257 // this function should only be called on a node whose set is
258 // empty to start with.
259 n->fFirstPosSet->addElement(n, *fStatus);
260 return;
261 }
262
263 // The node is not a leaf.
264 // Calculate firstPos on its children.
265 calcFirstPos(n->fLeftChild);
266 calcFirstPos(n->fRightChild);
267
268 // Apply functions from table 3.40 in Aho
269 if (n->fType == RBBINode::opOr) {
270 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
271 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
272 }
273 else if (n->fType == RBBINode::opCat) {
274 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
275 if (n->fLeftChild->fNullable) {
276 setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277 }
278 }
279 else if (n->fType == RBBINode::opStar ||
280 n->fType == RBBINode::opQuestion ||
281 n->fType == RBBINode::opPlus) {
282 setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
283 }
284 }
285
286
287
288 //-----------------------------------------------------------------------------
289 //
290 // calcLastPos. Impossible to explain succinctly. See Aho, section 3.9
291 //
292 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)293 void RBBITableBuilder::calcLastPos(RBBINode *n) {
294 if (n == NULL) {
295 return;
296 }
297 if (n->fType == RBBINode::leafChar ||
298 n->fType == RBBINode::endMark ||
299 n->fType == RBBINode::lookAhead ||
300 n->fType == RBBINode::tag) {
301 // These are non-empty leaf node types.
302 // Note: In order to maintain the sort invariant on the set,
303 // this function should only be called on a node whose set is
304 // empty to start with.
305 n->fLastPosSet->addElement(n, *fStatus);
306 return;
307 }
308
309 // The node is not a leaf.
310 // Calculate lastPos on its children.
311 calcLastPos(n->fLeftChild);
312 calcLastPos(n->fRightChild);
313
314 // Apply functions from table 3.40 in Aho
315 if (n->fType == RBBINode::opOr) {
316 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
317 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
318 }
319 else if (n->fType == RBBINode::opCat) {
320 setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
321 if (n->fRightChild->fNullable) {
322 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
323 }
324 }
325 else if (n->fType == RBBINode::opStar ||
326 n->fType == RBBINode::opQuestion ||
327 n->fType == RBBINode::opPlus) {
328 setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
329 }
330 }
331
332
333
334 //-----------------------------------------------------------------------------
335 //
336 // calcFollowPos. Impossible to explain succinctly. See Aho, section 3.9
337 //
338 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)339 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
340 if (n == NULL ||
341 n->fType == RBBINode::leafChar ||
342 n->fType == RBBINode::endMark) {
343 return;
344 }
345
346 calcFollowPos(n->fLeftChild);
347 calcFollowPos(n->fRightChild);
348
349 // Aho rule #1
350 if (n->fType == RBBINode::opCat) {
351 RBBINode *i; // is 'i' in Aho's description
352 uint32_t ix;
353
354 UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
355
356 for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
357 i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
358 setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
359 }
360 }
361
362 // Aho rule #2
363 if (n->fType == RBBINode::opStar ||
364 n->fType == RBBINode::opPlus) {
365 RBBINode *i; // again, n and i are the names from Aho's description.
366 uint32_t ix;
367
368 for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
369 i = (RBBINode *)n->fLastPosSet->elementAt(ix);
370 setAdd(i->fFollowPos, n->fFirstPosSet);
371 }
372 }
373
374
375
376 }
377
378 //-----------------------------------------------------------------------------
379 //
380 // addRuleRootNodes Recursively walk a parse tree, adding all nodes flagged
381 // as roots of a rule to a destination vector.
382 //
383 //-----------------------------------------------------------------------------
addRuleRootNodes(UVector * dest,RBBINode * node)384 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
385 if (node == NULL || U_FAILURE(*fStatus)) {
386 return;
387 }
388 if (node->fRuleRoot) {
389 dest->addElement(node, *fStatus);
390 // Note: rules cannot nest. If we found a rule start node,
391 // no child node can also be a start node.
392 return;
393 }
394 addRuleRootNodes(dest, node->fLeftChild);
395 addRuleRootNodes(dest, node->fRightChild);
396 }
397
398 //-----------------------------------------------------------------------------
399 //
400 // calcChainedFollowPos. Modify the previously calculated followPos sets
401 // to implement rule chaining. NOT described by Aho
402 //
403 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree)404 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
405
406 UVector endMarkerNodes(*fStatus);
407 UVector leafNodes(*fStatus);
408 int32_t i;
409
410 if (U_FAILURE(*fStatus)) {
411 return;
412 }
413
414 // get a list of all endmarker nodes.
415 tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
416
417 // get a list all leaf nodes
418 tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
419 if (U_FAILURE(*fStatus)) {
420 return;
421 }
422
423 // Collect all leaf nodes that can start matches for rules
424 // with inbound chaining enabled, which is the union of the
425 // firstPosition sets from each of the rule root nodes.
426
427 UVector ruleRootNodes(*fStatus);
428 addRuleRootNodes(&ruleRootNodes, tree);
429
430 UVector matchStartNodes(*fStatus);
431 for (int j=0; j<ruleRootNodes.size(); ++j) {
432 RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
433 if (node->fChainIn) {
434 setAdd(&matchStartNodes, node->fFirstPosSet);
435 }
436 }
437 if (U_FAILURE(*fStatus)) {
438 return;
439 }
440
441 int32_t endNodeIx;
442 int32_t startNodeIx;
443
444 for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
445 RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
446 RBBINode *endNode = NULL;
447
448 // Identify leaf nodes that correspond to overall rule match positions.
449 // These include an endMarkerNode in their followPos sets.
450 for (i=0; i<endMarkerNodes.size(); i++) {
451 if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
452 endNode = tNode;
453 break;
454 }
455 }
456 if (endNode == NULL) {
457 // node wasn't an end node. Try again with the next.
458 continue;
459 }
460
461 // We've got a node that can end a match.
462
463 // Line Break Specific hack: If this node's val correspond to the $CM char class,
464 // don't chain from it.
465 // TODO: Add rule syntax for this behavior, get specifics out of here and
466 // into the rule file.
467 if (fRB->fLBCMNoChain) {
468 UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
469 if (c != -1) {
470 // c == -1 occurs with sets containing only the {eof} marker string.
471 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
472 if (cLBProp == U_LB_COMBINING_MARK) {
473 continue;
474 }
475 }
476 }
477
478
479 // Now iterate over the nodes that can start a match, looking for ones
480 // with the same char class as our ending node.
481 RBBINode *startNode;
482 for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
483 startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
484 if (startNode->fType != RBBINode::leafChar) {
485 continue;
486 }
487
488 if (endNode->fVal == startNode->fVal) {
489 // The end val (character class) of one possible match is the
490 // same as the start of another.
491
492 // Add all nodes from the followPos of the start node to the
493 // followPos set of the end node, which will have the effect of
494 // letting matches transition from a match state at endNode
495 // to the second char of a match starting with startNode.
496 setAdd(endNode->fFollowPos, startNode->fFollowPos);
497 }
498 }
499 }
500 }
501
502
503 //-----------------------------------------------------------------------------
504 //
505 // bofFixup. Fixup for state tables that include {bof} beginning of input testing.
506 // Do an swizzle similar to chaining, modifying the followPos set of
507 // the bofNode to include the followPos nodes from other {bot} nodes
508 // scattered through the tree.
509 //
510 // This function has much in common with calcChainedFollowPos().
511 //
512 //-----------------------------------------------------------------------------
bofFixup()513 void RBBITableBuilder::bofFixup() {
514
515 if (U_FAILURE(*fStatus)) {
516 return;
517 }
518
519 // The parse tree looks like this ...
520 // fTree root ---> <cat>
521 // / \ .
522 // <cat> <#end node>
523 // / \ .
524 // <bofNode> rest
525 // of tree
526 //
527 // We will be adding things to the followPos set of the <bofNode>
528 //
529 RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
530 U_ASSERT(bofNode->fType == RBBINode::leafChar);
531 U_ASSERT(bofNode->fVal == 2);
532
533 // Get all nodes that can be the start a match of the user-written rules
534 // (excluding the fake bofNode)
535 // We want the nodes that can start a match in the
536 // part labeled "rest of tree"
537 //
538 UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
539
540 RBBINode *startNode;
541 int startNodeIx;
542 for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
543 startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
544 if (startNode->fType != RBBINode::leafChar) {
545 continue;
546 }
547
548 if (startNode->fVal == bofNode->fVal) {
549 // We found a leaf node corresponding to a {bof} that was
550 // explicitly written into a rule.
551 // Add everything from the followPos set of this node to the
552 // followPos set of the fake bofNode at the start of the tree.
553 //
554 setAdd(bofNode->fFollowPos, startNode->fFollowPos);
555 }
556 }
557 }
558
559 //-----------------------------------------------------------------------------
560 //
561 // buildStateTable() Determine the set of runtime DFA states and the
562 // transition tables for these states, by the algorithm
563 // of fig. 3.44 in Aho.
564 //
565 // Most of the comments are quotes of Aho's psuedo-code.
566 //
567 //-----------------------------------------------------------------------------
buildStateTable()568 void RBBITableBuilder::buildStateTable() {
569 if (U_FAILURE(*fStatus)) {
570 return;
571 }
572 RBBIStateDescriptor *failState;
573 // Set it to NULL to avoid uninitialized warning
574 RBBIStateDescriptor *initialState = NULL;
575 //
576 // Add a dummy state 0 - the stop state. Not from Aho.
577 int lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
578 failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
579 if (failState == NULL) {
580 *fStatus = U_MEMORY_ALLOCATION_ERROR;
581 goto ExitBuildSTdeleteall;
582 }
583 failState->fPositions = new UVector(*fStatus);
584 if (failState->fPositions == NULL) {
585 *fStatus = U_MEMORY_ALLOCATION_ERROR;
586 }
587 if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
588 goto ExitBuildSTdeleteall;
589 }
590 fDStates->addElement(failState, *fStatus);
591 if (U_FAILURE(*fStatus)) {
592 goto ExitBuildSTdeleteall;
593 }
594
595 // initially, the only unmarked state in Dstates is firstpos(root),
596 // where toot is the root of the syntax tree for (r)#;
597 initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
598 if (initialState == NULL) {
599 *fStatus = U_MEMORY_ALLOCATION_ERROR;
600 }
601 if (U_FAILURE(*fStatus)) {
602 goto ExitBuildSTdeleteall;
603 }
604 initialState->fPositions = new UVector(*fStatus);
605 if (initialState->fPositions == NULL) {
606 *fStatus = U_MEMORY_ALLOCATION_ERROR;
607 }
608 if (U_FAILURE(*fStatus)) {
609 goto ExitBuildSTdeleteall;
610 }
611 setAdd(initialState->fPositions, fTree->fFirstPosSet);
612 fDStates->addElement(initialState, *fStatus);
613 if (U_FAILURE(*fStatus)) {
614 goto ExitBuildSTdeleteall;
615 }
616
617 // while there is an unmarked state T in Dstates do begin
618 for (;;) {
619 RBBIStateDescriptor *T = NULL;
620 int32_t tx;
621 for (tx=1; tx<fDStates->size(); tx++) {
622 RBBIStateDescriptor *temp;
623 temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
624 if (temp->fMarked == FALSE) {
625 T = temp;
626 break;
627 }
628 }
629 if (T == NULL) {
630 break;
631 }
632
633 // mark T;
634 T->fMarked = TRUE;
635
636 // for each input symbol a do begin
637 int32_t a;
638 for (a = 1; a<=lastInputSymbol; a++) {
639 // let U be the set of positions that are in followpos(p)
640 // for some position p in T
641 // such that the symbol at position p is a;
642 UVector *U = NULL;
643 RBBINode *p;
644 int32_t px;
645 for (px=0; px<T->fPositions->size(); px++) {
646 p = (RBBINode *)T->fPositions->elementAt(px);
647 if ((p->fType == RBBINode::leafChar) && (p->fVal == a)) {
648 if (U == NULL) {
649 U = new UVector(*fStatus);
650 if (U == NULL) {
651 *fStatus = U_MEMORY_ALLOCATION_ERROR;
652 goto ExitBuildSTdeleteall;
653 }
654 }
655 setAdd(U, p->fFollowPos);
656 }
657 }
658
659 // if U is not empty and not in DStates then
660 int32_t ux = 0;
661 UBool UinDstates = FALSE;
662 if (U != NULL) {
663 U_ASSERT(U->size() > 0);
664 int ix;
665 for (ix=0; ix<fDStates->size(); ix++) {
666 RBBIStateDescriptor *temp2;
667 temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
668 if (setEquals(U, temp2->fPositions)) {
669 delete U;
670 U = temp2->fPositions;
671 ux = ix;
672 UinDstates = TRUE;
673 break;
674 }
675 }
676
677 // Add U as an unmarked state to Dstates
678 if (!UinDstates)
679 {
680 RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
681 if (newState == NULL) {
682 *fStatus = U_MEMORY_ALLOCATION_ERROR;
683 }
684 if (U_FAILURE(*fStatus)) {
685 goto ExitBuildSTdeleteall;
686 }
687 newState->fPositions = U;
688 fDStates->addElement(newState, *fStatus);
689 if (U_FAILURE(*fStatus)) {
690 return;
691 }
692 ux = fDStates->size()-1;
693 }
694
695 // Dtran[T, a] := U;
696 T->fDtran->setElementAt(ux, a);
697 }
698 }
699 }
700 return;
701 // delete local pointers only if error occured.
702 ExitBuildSTdeleteall:
703 delete initialState;
704 delete failState;
705 }
706
707
708
709 //-----------------------------------------------------------------------------
710 //
711 // flagAcceptingStates Identify accepting states.
712 // First get a list of all of the end marker nodes.
713 // Then, for each state s,
714 // if s contains one of the end marker nodes in its list of tree positions then
715 // s is an accepting state.
716 //
717 //-----------------------------------------------------------------------------
flagAcceptingStates()718 void RBBITableBuilder::flagAcceptingStates() {
719 if (U_FAILURE(*fStatus)) {
720 return;
721 }
722 UVector endMarkerNodes(*fStatus);
723 RBBINode *endMarker;
724 int32_t i;
725 int32_t n;
726
727 if (U_FAILURE(*fStatus)) {
728 return;
729 }
730
731 fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
732 if (U_FAILURE(*fStatus)) {
733 return;
734 }
735
736 for (i=0; i<endMarkerNodes.size(); i++) {
737 endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
738 for (n=0; n<fDStates->size(); n++) {
739 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
740 if (sd->fPositions->indexOf(endMarker) >= 0) {
741 // Any non-zero value for fAccepting means this is an accepting node.
742 // The value is what will be returned to the user as the break status.
743 // If no other value was specified, force it to -1.
744
745 if (sd->fAccepting==0) {
746 // State hasn't been marked as accepting yet. Do it now.
747 sd->fAccepting = endMarker->fVal;
748 if (sd->fAccepting == 0) {
749 sd->fAccepting = -1;
750 }
751 }
752 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
753 // Both lookahead and non-lookahead accepting for this state.
754 // Favor the look-ahead. Expedient for line break.
755 // TODO: need a more elegant resolution for conflicting rules.
756 sd->fAccepting = endMarker->fVal;
757 }
758 // implicit else:
759 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
760
761 // If the end marker node is from a look-ahead rule, set
762 // the fLookAhead field for this state also.
763 if (endMarker->fLookAheadEnd) {
764 // TODO: don't change value if already set?
765 // TODO: allow for more than one active look-ahead rule in engine.
766 // Make value here an index to a side array in engine?
767 sd->fLookAhead = sd->fAccepting;
768 }
769 }
770 }
771 }
772 }
773
774
775 //-----------------------------------------------------------------------------
776 //
777 // flagLookAheadStates Very similar to flagAcceptingStates, above.
778 //
779 //-----------------------------------------------------------------------------
flagLookAheadStates()780 void RBBITableBuilder::flagLookAheadStates() {
781 if (U_FAILURE(*fStatus)) {
782 return;
783 }
784 UVector lookAheadNodes(*fStatus);
785 RBBINode *lookAheadNode;
786 int32_t i;
787 int32_t n;
788
789 fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
790 if (U_FAILURE(*fStatus)) {
791 return;
792 }
793 for (i=0; i<lookAheadNodes.size(); i++) {
794 lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
795
796 for (n=0; n<fDStates->size(); n++) {
797 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
798 if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
799 sd->fLookAhead = lookAheadNode->fVal;
800 }
801 }
802 }
803 }
804
805
806
807
808 //-----------------------------------------------------------------------------
809 //
810 // flagTaggedStates
811 //
812 //-----------------------------------------------------------------------------
flagTaggedStates()813 void RBBITableBuilder::flagTaggedStates() {
814 if (U_FAILURE(*fStatus)) {
815 return;
816 }
817 UVector tagNodes(*fStatus);
818 RBBINode *tagNode;
819 int32_t i;
820 int32_t n;
821
822 if (U_FAILURE(*fStatus)) {
823 return;
824 }
825 fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
826 if (U_FAILURE(*fStatus)) {
827 return;
828 }
829 for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
830 tagNode = (RBBINode *)tagNodes.elementAt(i);
831
832 for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
833 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
834 if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
835 sortedAdd(&sd->fTagVals, tagNode->fVal);
836 }
837 }
838 }
839 }
840
841
842
843
844 //-----------------------------------------------------------------------------
845 //
846 // mergeRuleStatusVals
847 //
848 // Update the global table of rule status {tag} values
849 // The rule builder has a global vector of status values that are common
850 // for all tables. Merge the ones from this table into the global set.
851 //
852 //-----------------------------------------------------------------------------
mergeRuleStatusVals()853 void RBBITableBuilder::mergeRuleStatusVals() {
854 //
855 // The basic outline of what happens here is this...
856 //
857 // for each state in this state table
858 // if the status tag list for this state is in the global statuses list
859 // record where and
860 // continue with the next state
861 // else
862 // add the tag list for this state to the global list.
863 //
864 int i;
865 int n;
866
867 // Pre-set a single tag of {0} into the table.
868 // We will need this as a default, for rule sets with no explicit tagging.
869 if (fRB->fRuleStatusVals->size() == 0) {
870 fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
871 fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
872 }
873
874 // For each state
875 for (n=0; n<fDStates->size(); n++) {
876 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
877 UVector *thisStatesTagValues = sd->fTagVals;
878 if (thisStatesTagValues == NULL) {
879 // No tag values are explicitly associated with this state.
880 // Set the default tag value.
881 sd->fTagsIdx = 0;
882 continue;
883 }
884
885 // There are tag(s) associated with this state.
886 // fTagsIdx will be the index into the global tag list for this state's tag values.
887 // Initial value of -1 flags that we haven't got it set yet.
888 sd->fTagsIdx = -1;
889 int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
890 int32_t nextTagGroupStart = 0;
891
892 // Loop runs once per group of tags in the global list
893 while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
894 thisTagGroupStart = nextTagGroupStart;
895 nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
896 if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
897 // The number of tags for this state is different from
898 // the number of tags in this group from the global list.
899 // Continue with the next group from the global list.
900 continue;
901 }
902 // The lengths match, go ahead and compare the actual tag values
903 // between this state and the group from the global list.
904 for (i=0; i<thisStatesTagValues->size(); i++) {
905 if (thisStatesTagValues->elementAti(i) !=
906 fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
907 // Mismatch.
908 break;
909 }
910 }
911
912 if (i == thisStatesTagValues->size()) {
913 // We found a set of tag values in the global list that match
914 // those for this state. Use them.
915 sd->fTagsIdx = thisTagGroupStart;
916 break;
917 }
918 }
919
920 if (sd->fTagsIdx == -1) {
921 // No suitable entry in the global tag list already. Add one
922 sd->fTagsIdx = fRB->fRuleStatusVals->size();
923 fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
924 for (i=0; i<thisStatesTagValues->size(); i++) {
925 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
926 }
927 }
928 }
929 }
930
931
932
933
934
935
936
937 //-----------------------------------------------------------------------------
938 //
939 // sortedAdd Add a value to a vector of sorted values (ints).
940 // Do not replicate entries; if the value is already there, do not
941 // add a second one.
942 // Lazily create the vector if it does not already exist.
943 //
944 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)945 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
946 int32_t i;
947
948 if (*vector == NULL) {
949 *vector = new UVector(*fStatus);
950 }
951 if (*vector == NULL || U_FAILURE(*fStatus)) {
952 return;
953 }
954 UVector *vec = *vector;
955 int32_t vSize = vec->size();
956 for (i=0; i<vSize; i++) {
957 int32_t valAtI = vec->elementAti(i);
958 if (valAtI == val) {
959 // The value is already in the vector. Don't add it again.
960 return;
961 }
962 if (valAtI > val) {
963 break;
964 }
965 }
966 vec->insertElementAt(val, i, *fStatus);
967 }
968
969
970
971 //-----------------------------------------------------------------------------
972 //
973 // setAdd Set operation on UVector
974 // dest = dest union source
975 // Elements may only appear once and must be sorted.
976 //
977 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)978 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
979 int32_t destOriginalSize = dest->size();
980 int32_t sourceSize = source->size();
981 int32_t di = 0;
982 MaybeStackArray<void *, 16> destArray, sourceArray; // Handle small cases without malloc
983 void **destPtr, **sourcePtr;
984 void **destLim, **sourceLim;
985
986 if (destOriginalSize > destArray.getCapacity()) {
987 if (destArray.resize(destOriginalSize) == NULL) {
988 return;
989 }
990 }
991 destPtr = destArray.getAlias();
992 destLim = destPtr + destOriginalSize; // destArray.getArrayLimit()?
993
994 if (sourceSize > sourceArray.getCapacity()) {
995 if (sourceArray.resize(sourceSize) == NULL) {
996 return;
997 }
998 }
999 sourcePtr = sourceArray.getAlias();
1000 sourceLim = sourcePtr + sourceSize; // sourceArray.getArrayLimit()?
1001
1002 // Avoid multiple "get element" calls by getting the contents into arrays
1003 (void) dest->toArray(destPtr);
1004 (void) source->toArray(sourcePtr);
1005
1006 dest->setSize(sourceSize+destOriginalSize, *fStatus);
1007
1008 while (sourcePtr < sourceLim && destPtr < destLim) {
1009 if (*destPtr == *sourcePtr) {
1010 dest->setElementAt(*sourcePtr++, di++);
1011 destPtr++;
1012 }
1013 // This check is required for machines with segmented memory, like i5/OS.
1014 // Direct pointer comparison is not recommended.
1015 else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1016 dest->setElementAt(*destPtr++, di++);
1017 }
1018 else { /* *sourcePtr < *destPtr */
1019 dest->setElementAt(*sourcePtr++, di++);
1020 }
1021 }
1022
1023 // At most one of these two cleanup loops will execute
1024 while (destPtr < destLim) {
1025 dest->setElementAt(*destPtr++, di++);
1026 }
1027 while (sourcePtr < sourceLim) {
1028 dest->setElementAt(*sourcePtr++, di++);
1029 }
1030
1031 dest->setSize(di, *fStatus);
1032 }
1033
1034
1035
1036 //-----------------------------------------------------------------------------
1037 //
1038 // setEqual Set operation on UVector.
1039 // Compare for equality.
1040 // Elements must be sorted.
1041 //
1042 //-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1043 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1044 return a->equals(*b);
1045 }
1046
1047
1048 //-----------------------------------------------------------------------------
1049 //
1050 // printPosSets Debug function. Dump Nullable, firstpos, lastpos and followpos
1051 // for each node in the tree.
1052 //
1053 //-----------------------------------------------------------------------------
1054 #ifdef RBBI_DEBUG
printPosSets(RBBINode * n)1055 void RBBITableBuilder::printPosSets(RBBINode *n) {
1056 if (n==NULL) {
1057 return;
1058 }
1059 printf("\n");
1060 RBBINode::printNodeHeader();
1061 RBBINode::printNode(n);
1062 RBBIDebugPrintf(" Nullable: %s\n", n->fNullable?"TRUE":"FALSE");
1063
1064 RBBIDebugPrintf(" firstpos: ");
1065 printSet(n->fFirstPosSet);
1066
1067 RBBIDebugPrintf(" lastpos: ");
1068 printSet(n->fLastPosSet);
1069
1070 RBBIDebugPrintf(" followpos: ");
1071 printSet(n->fFollowPos);
1072
1073 printPosSets(n->fLeftChild);
1074 printPosSets(n->fRightChild);
1075 }
1076 #endif
1077
1078 //
1079 // findDuplCharClassFrom()
1080 //
findDuplCharClassFrom(IntPair * categories)1081 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1082 int32_t numStates = fDStates->size();
1083 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1084
1085 for (; categories->first < numCols-1; categories->first++) {
1086 for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
1087 // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1088 uint16_t table_base = 0;
1089 uint16_t table_dupl = 1;
1090 for (int32_t state=0; state<numStates; state++) {
1091 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1092 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1093 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1094 if (table_base != table_dupl) {
1095 break;
1096 }
1097 }
1098 if (table_base == table_dupl) {
1099 return true;
1100 }
1101 }
1102 }
1103 return false;
1104 }
1105
1106
1107 //
1108 // removeColumn()
1109 //
removeColumn(int32_t column)1110 void RBBITableBuilder::removeColumn(int32_t column) {
1111 int32_t numStates = fDStates->size();
1112 for (int32_t state=0; state<numStates; state++) {
1113 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1114 U_ASSERT(column < sd->fDtran->size());
1115 sd->fDtran->removeElementAt(column);
1116 }
1117 }
1118
1119 /*
1120 * findDuplicateState
1121 */
findDuplicateState(IntPair * states)1122 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1123 int32_t numStates = fDStates->size();
1124 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1125
1126 for (; states->first<numStates-1; states->first++) {
1127 RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1128 for (states->second=states->first+1; states->second<numStates; states->second++) {
1129 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1130 if (firstSD->fAccepting != duplSD->fAccepting ||
1131 firstSD->fLookAhead != duplSD->fLookAhead ||
1132 firstSD->fTagsIdx != duplSD->fTagsIdx) {
1133 continue;
1134 }
1135 bool rowsMatch = true;
1136 for (int32_t col=0; col < numCols; ++col) {
1137 int32_t firstVal = firstSD->fDtran->elementAti(col);
1138 int32_t duplVal = duplSD->fDtran->elementAti(col);
1139 if (!((firstVal == duplVal) ||
1140 ((firstVal == states->first || firstVal == states->second) &&
1141 (duplVal == states->first || duplVal == states->second)))) {
1142 rowsMatch = false;
1143 break;
1144 }
1145 }
1146 if (rowsMatch) {
1147 return true;
1148 }
1149 }
1150 }
1151 return false;
1152 }
1153
1154
findDuplicateSafeState(IntPair * states)1155 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1156 int32_t numStates = fSafeTable->size();
1157
1158 for (; states->first<numStates-1; states->first++) {
1159 UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1160 for (states->second=states->first+1; states->second<numStates; states->second++) {
1161 UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1162 bool rowsMatch = true;
1163 int32_t numCols = firstRow->length();
1164 for (int32_t col=0; col < numCols; ++col) {
1165 int32_t firstVal = firstRow->charAt(col);
1166 int32_t duplVal = duplRow->charAt(col);
1167 if (!((firstVal == duplVal) ||
1168 ((firstVal == states->first || firstVal == states->second) &&
1169 (duplVal == states->first || duplVal == states->second)))) {
1170 rowsMatch = false;
1171 break;
1172 }
1173 }
1174 if (rowsMatch) {
1175 return true;
1176 }
1177 }
1178 }
1179 return false;
1180 }
1181
1182
removeState(IntPair duplStates)1183 void RBBITableBuilder::removeState(IntPair duplStates) {
1184 const int32_t keepState = duplStates.first;
1185 const int32_t duplState = duplStates.second;
1186 U_ASSERT(keepState < duplState);
1187 U_ASSERT(duplState < fDStates->size());
1188
1189 RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1190 fDStates->removeElementAt(duplState);
1191 delete duplSD;
1192
1193 int32_t numStates = fDStates->size();
1194 int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1195 for (int32_t state=0; state<numStates; ++state) {
1196 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1197 for (int32_t col=0; col<numCols; col++) {
1198 int32_t existingVal = sd->fDtran->elementAti(col);
1199 int32_t newVal = existingVal;
1200 if (existingVal == duplState) {
1201 newVal = keepState;
1202 } else if (existingVal > duplState) {
1203 newVal = existingVal - 1;
1204 }
1205 sd->fDtran->setElementAt(newVal, col);
1206 }
1207 if (sd->fAccepting == duplState) {
1208 sd->fAccepting = keepState;
1209 } else if (sd->fAccepting > duplState) {
1210 sd->fAccepting--;
1211 }
1212 if (sd->fLookAhead == duplState) {
1213 sd->fLookAhead = keepState;
1214 } else if (sd->fLookAhead > duplState) {
1215 sd->fLookAhead--;
1216 }
1217 }
1218 }
1219
removeSafeState(IntPair duplStates)1220 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1221 const int32_t keepState = duplStates.first;
1222 const int32_t duplState = duplStates.second;
1223 U_ASSERT(keepState < duplState);
1224 U_ASSERT(duplState < fSafeTable->size());
1225
1226 fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function
1227 // and will auto-delete the removed element.
1228 int32_t numStates = fSafeTable->size();
1229 for (int32_t state=0; state<numStates; ++state) {
1230 UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1231 int32_t numCols = sd->length();
1232 for (int32_t col=0; col<numCols; col++) {
1233 int32_t existingVal = sd->charAt(col);
1234 int32_t newVal = existingVal;
1235 if (existingVal == duplState) {
1236 newVal = keepState;
1237 } else if (existingVal > duplState) {
1238 newVal = existingVal - 1;
1239 }
1240 sd->setCharAt(col, static_cast<char16_t>(newVal));
1241 }
1242 }
1243 }
1244
1245
1246 /*
1247 * RemoveDuplicateStates
1248 */
removeDuplicateStates()1249 int32_t RBBITableBuilder::removeDuplicateStates() {
1250 IntPair dupls = {3, 0};
1251 int32_t numStatesRemoved = 0;
1252
1253 while (findDuplicateState(&dupls)) {
1254 // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1255 removeState(dupls);
1256 ++numStatesRemoved;
1257 }
1258 return numStatesRemoved;
1259 }
1260
1261
1262 //-----------------------------------------------------------------------------
1263 //
1264 // getTableSize() Calculate the size of the runtime form of this
1265 // state transition table.
1266 //
1267 //-----------------------------------------------------------------------------
getTableSize() const1268 int32_t RBBITableBuilder::getTableSize() const {
1269 int32_t size = 0;
1270 int32_t numRows;
1271 int32_t numCols;
1272 int32_t rowSize;
1273
1274 if (fTree == NULL) {
1275 return 0;
1276 }
1277
1278 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1279
1280 numRows = fDStates->size();
1281 numCols = fRB->fSetBuilder->getNumCharCategories();
1282
1283 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1284 size += numRows * rowSize;
1285 return size;
1286 }
1287
1288
1289 //-----------------------------------------------------------------------------
1290 //
1291 // exportTable() export the state transition table in the format required
1292 // by the runtime engine. getTableSize() bytes of memory
1293 // must be available at the output address "where".
1294 //
1295 //-----------------------------------------------------------------------------
exportTable(void * where)1296 void RBBITableBuilder::exportTable(void *where) {
1297 RBBIStateTable *table = (RBBIStateTable *)where;
1298 uint32_t state;
1299 int col;
1300
1301 if (U_FAILURE(*fStatus) || fTree == NULL) {
1302 return;
1303 }
1304
1305 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1306 if (catCount > 0x7fff ||
1307 fDStates->size() > 0x7fff) {
1308 *fStatus = U_BRK_INTERNAL_ERROR;
1309 return;
1310 }
1311
1312 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1313 table->fNumStates = fDStates->size();
1314 table->fFlags = 0;
1315 if (fRB->fLookAheadHardBreak) {
1316 table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
1317 }
1318 if (fRB->fSetBuilder->sawBOF()) {
1319 table->fFlags |= RBBI_BOF_REQUIRED;
1320 }
1321 table->fReserved = 0;
1322
1323 for (state=0; state<table->fNumStates; state++) {
1324 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1325 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1326 U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1327 U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1328 row->fAccepting = (int16_t)sd->fAccepting;
1329 row->fLookAhead = (int16_t)sd->fLookAhead;
1330 row->fTagIdx = (int16_t)sd->fTagsIdx;
1331 for (col=0; col<catCount; col++) {
1332 row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1333 }
1334 }
1335 }
1336
1337
1338 /**
1339 * Synthesize a safe state table from the main state table.
1340 */
buildSafeReverseTable(UErrorCode & status)1341 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1342 // The safe table creation has three steps:
1343
1344 // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
1345 // following the pair do not depend on context or state before the pair. To test
1346 // whether a pair is safe, run it through the main forward state table, starting
1347 // from each state. If the the final state is the same, no matter what the starting state,
1348 // the pair is safe.
1349 //
1350 // 2. Build a state table that recognizes the safe pairs. It's similar to their
1351 // forward table, with a column for each input character [class], and a row for
1352 // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1353 // create an additional state for each input character category; being in
1354 // one of these states means that the character has been seen, and is potentially
1355 // the first of a pair. In each of these rows, the entry for the second character
1356 // of a safe pair is set to the stop state (0), indicating that a match was found.
1357 // All other table entries are set to the state corresponding the current input
1358 // character, allowing that charcter to be the of a start following pair.
1359 //
1360 // Because the safe rules are to be run in reverse, moving backwards in the text,
1361 // the first and second pair categories are swapped when building the table.
1362 //
1363 // 3. Compress the table. There are typically many rows (states) that are
1364 // equivalent - that have zeroes (match completed) in the same columns -
1365 // and can be folded together.
1366
1367 // Each safe pair is stored as two UChars in the safePair string.
1368 UnicodeString safePairs;
1369
1370 int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1371 int32_t numStates = fDStates->size();
1372
1373 for (int32_t c1=0; c1<numCharClasses; ++c1) {
1374 for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1375 int32_t wantedEndState = -1;
1376 int32_t endState = 0;
1377 for (int32_t startState = 1; startState < numStates; ++startState) {
1378 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1379 int32_t s2 = startStateD->fDtran->elementAti(c1);
1380 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1381 endState = s2StateD->fDtran->elementAti(c2);
1382 if (wantedEndState < 0) {
1383 wantedEndState = endState;
1384 } else {
1385 if (wantedEndState != endState) {
1386 break;
1387 }
1388 }
1389 }
1390 if (wantedEndState == endState) {
1391 safePairs.append((char16_t)c1);
1392 safePairs.append((char16_t)c2);
1393 // printf("(%d, %d) ", c1, c2);
1394 }
1395 }
1396 // printf("\n");
1397 }
1398
1399 // Populate the initial safe table.
1400 // The table as a whole is UVector<UnicodeString>
1401 // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1402 // Row 0 is the stop state.
1403 // Row 1 is the start sate.
1404 // Row 2 and beyond are other states, initially one per char class, but
1405 // after initial construction, many of the states will be combined, compacting the table.
1406 // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1407 // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1408
1409 U_ASSERT(fSafeTable == nullptr);
1410 fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
1411 for (int32_t row=0; row<numCharClasses + 2; ++row) {
1412 fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1413 }
1414
1415 // From the start state, each input char class transitions to the state for that input.
1416 UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1417 for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1418 // Note: +2 for the start & stop state.
1419 startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1420 }
1421
1422 // Initially make every other state table row look like the start state row,
1423 for (int32_t row=2; row<numCharClasses+2; ++row) {
1424 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1425 rowState = startState; // UnicodeString assignment, copies contents.
1426 }
1427
1428 // Run through the safe pairs, set the next state to zero when pair has been seen.
1429 // Zero being the stop state, meaning we found a safe point.
1430 for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1431 int32_t c1 = safePairs.charAt(pairIdx);
1432 int32_t c2 = safePairs.charAt(pairIdx + 1);
1433
1434 UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1435 rowState.setCharAt(c1, 0);
1436 }
1437
1438 // Remove duplicate or redundant rows from the table.
1439 IntPair states = {1, 0};
1440 while (findDuplicateSafeState(&states)) {
1441 // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1442 removeSafeState(states);
1443 }
1444 }
1445
1446
1447 //-----------------------------------------------------------------------------
1448 //
1449 // getSafeTableSize() Calculate the size of the runtime form of this
1450 // safe state table.
1451 //
1452 //-----------------------------------------------------------------------------
getSafeTableSize() const1453 int32_t RBBITableBuilder::getSafeTableSize() const {
1454 int32_t size = 0;
1455 int32_t numRows;
1456 int32_t numCols;
1457 int32_t rowSize;
1458
1459 if (fSafeTable == nullptr) {
1460 return 0;
1461 }
1462
1463 size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
1464
1465 numRows = fSafeTable->size();
1466 numCols = fRB->fSetBuilder->getNumCharCategories();
1467
1468 rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1469 size += numRows * rowSize;
1470 return size;
1471 }
1472
1473
1474 //-----------------------------------------------------------------------------
1475 //
1476 // exportSafeTable() export the state transition table in the format required
1477 // by the runtime engine. getTableSize() bytes of memory
1478 // must be available at the output address "where".
1479 //
1480 //-----------------------------------------------------------------------------
exportSafeTable(void * where)1481 void RBBITableBuilder::exportSafeTable(void *where) {
1482 RBBIStateTable *table = (RBBIStateTable *)where;
1483 uint32_t state;
1484 int col;
1485
1486 if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1487 return;
1488 }
1489
1490 int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1491 if (catCount > 0x7fff ||
1492 fSafeTable->size() > 0x7fff) {
1493 *fStatus = U_BRK_INTERNAL_ERROR;
1494 return;
1495 }
1496
1497 table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1498 table->fNumStates = fSafeTable->size();
1499 table->fFlags = 0;
1500 table->fReserved = 0;
1501
1502 for (state=0; state<table->fNumStates; state++) {
1503 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1504 RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1505 row->fAccepting = 0;
1506 row->fLookAhead = 0;
1507 row->fTagIdx = 0;
1508 row->fReserved = 0;
1509 for (col=0; col<catCount; col++) {
1510 row->fNextState[col] = rowString->charAt(col);
1511 }
1512 }
1513 }
1514
1515
1516
1517
1518 //-----------------------------------------------------------------------------
1519 //
1520 // printSet Debug function. Print the contents of a UVector
1521 //
1522 //-----------------------------------------------------------------------------
1523 #ifdef RBBI_DEBUG
printSet(UVector * s)1524 void RBBITableBuilder::printSet(UVector *s) {
1525 int32_t i;
1526 for (i=0; i<s->size(); i++) {
1527 const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1528 RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
1529 }
1530 RBBIDebugPrintf("\n");
1531 }
1532 #endif
1533
1534
1535 //-----------------------------------------------------------------------------
1536 //
1537 // printStates Debug Function. Dump the fully constructed state transition table.
1538 //
1539 //-----------------------------------------------------------------------------
1540 #ifdef RBBI_DEBUG
printStates()1541 void RBBITableBuilder::printStates() {
1542 int c; // input "character"
1543 int n; // state number
1544
1545 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1546 RBBIDebugPrintf(" | Acc LA Tag");
1547 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1548 RBBIDebugPrintf(" %2d", c);
1549 }
1550 RBBIDebugPrintf("\n");
1551 RBBIDebugPrintf(" |---------------");
1552 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1553 RBBIDebugPrintf("---");
1554 }
1555 RBBIDebugPrintf("\n");
1556
1557 for (n=0; n<fDStates->size(); n++) {
1558 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1559 RBBIDebugPrintf(" %3d | " , n);
1560 RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1561 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1562 RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1563 }
1564 RBBIDebugPrintf("\n");
1565 }
1566 RBBIDebugPrintf("\n\n");
1567 }
1568 #endif
1569
1570
1571 //-----------------------------------------------------------------------------
1572 //
1573 // printSafeTable Debug Function. Dump the fully constructed safe table.
1574 //
1575 //-----------------------------------------------------------------------------
1576 #ifdef RBBI_DEBUG
printReverseTable()1577 void RBBITableBuilder::printReverseTable() {
1578 int c; // input "character"
1579 int n; // state number
1580
1581 RBBIDebugPrintf(" Safe Reverse Table \n");
1582 if (fSafeTable == nullptr) {
1583 RBBIDebugPrintf(" --- nullptr ---\n");
1584 return;
1585 }
1586 RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
1587 RBBIDebugPrintf(" | Acc LA Tag");
1588 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1589 RBBIDebugPrintf(" %2d", c);
1590 }
1591 RBBIDebugPrintf("\n");
1592 RBBIDebugPrintf(" |---------------");
1593 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1594 RBBIDebugPrintf("---");
1595 }
1596 RBBIDebugPrintf("\n");
1597
1598 for (n=0; n<fSafeTable->size(); n++) {
1599 UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1600 RBBIDebugPrintf(" %3d | " , n);
1601 RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
1602 for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1603 RBBIDebugPrintf(" %2d", rowString->charAt(c));
1604 }
1605 RBBIDebugPrintf("\n");
1606 }
1607 RBBIDebugPrintf("\n\n");
1608 }
1609 #endif
1610
1611
1612
1613 //-----------------------------------------------------------------------------
1614 //
1615 // printRuleStatusTable Debug Function. Dump the common rule status table
1616 //
1617 //-----------------------------------------------------------------------------
1618 #ifdef RBBI_DEBUG
printRuleStatusTable()1619 void RBBITableBuilder::printRuleStatusTable() {
1620 int32_t thisRecord = 0;
1621 int32_t nextRecord = 0;
1622 int i;
1623 UVector *tbl = fRB->fRuleStatusVals;
1624
1625 RBBIDebugPrintf("index | tags \n");
1626 RBBIDebugPrintf("-------------------\n");
1627
1628 while (nextRecord < tbl->size()) {
1629 thisRecord = nextRecord;
1630 nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1631 RBBIDebugPrintf("%4d ", thisRecord);
1632 for (i=thisRecord+1; i<nextRecord; i++) {
1633 RBBIDebugPrintf(" %5d", tbl->elementAti(i));
1634 }
1635 RBBIDebugPrintf("\n");
1636 }
1637 RBBIDebugPrintf("\n\n");
1638 }
1639 #endif
1640
1641
1642 //-----------------------------------------------------------------------------
1643 //
1644 // RBBIStateDescriptor Methods. This is a very struct-like class
1645 // Most access is directly to the fields.
1646 //
1647 //-----------------------------------------------------------------------------
1648
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1649 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1650 fMarked = FALSE;
1651 fAccepting = 0;
1652 fLookAhead = 0;
1653 fTagsIdx = 0;
1654 fTagVals = NULL;
1655 fPositions = NULL;
1656 fDtran = NULL;
1657
1658 fDtran = new UVector32(lastInputSymbol+1, *fStatus);
1659 if (U_FAILURE(*fStatus)) {
1660 return;
1661 }
1662 if (fDtran == NULL) {
1663 *fStatus = U_MEMORY_ALLOCATION_ERROR;
1664 return;
1665 }
1666 fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
1667 // It is indexed by input symbols, and will
1668 // hold the next state number for each
1669 // symbol.
1670 }
1671
1672
~RBBIStateDescriptor()1673 RBBIStateDescriptor::~RBBIStateDescriptor() {
1674 delete fPositions;
1675 delete fDtran;
1676 delete fTagVals;
1677 fPositions = NULL;
1678 fDtran = NULL;
1679 fTagVals = NULL;
1680 }
1681
1682 U_NAMESPACE_END
1683
1684 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1685