• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (c) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 
10 package com.ibm.icu.text;
11 
12 import java.util.ArrayList;
13 import java.util.Arrays;
14 import java.util.Collection;
15 import java.util.HashSet;
16 import java.util.List;
17 import java.util.Set;
18 import java.util.SortedSet;
19 import java.util.TreeSet;
20 
21 import com.ibm.icu.impl.Assert;
22 import com.ibm.icu.impl.RBBIDataWrapper;
23 import com.ibm.icu.text.RBBIRuleBuilder.IntPair;
24 
25 /**
26  *  This class is part of the RBBI rule compiler.
27  *  It builds the state transition table used by the RBBI runtime
28  *  from the expression syntax tree generated by the rule scanner.
29  *
30  *  This class is part of the RBBI implementation only.
31  *  There is no user-visible public API here.
32  */
33 class RBBITableBuilder {
34 
35     //
36     //  RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
37     //                        one for each state.
38     static class RBBIStateDescriptor {
39         boolean      fMarked;
40         int          fAccepting;
41         int          fLookAhead;
42         SortedSet<Integer> fTagVals;
43         int          fTagsIdx;
44         Set<RBBINode> fPositions;                 // Set of parse tree positions associated
45                                                   //   with this state.  Unordered (it's a set).
46                                                   //   UVector contents are RBBINode *
47 
48         int[]        fDtran;                      // Transitions out of this state.
49                                                    //   indexed by input character
50                                                    //   contents is int index of dest state
51                                                    //   in RBBITableBuilder.fDStates
52 
RBBIStateDescriptor(int maxInputSymbol)53         RBBIStateDescriptor(int maxInputSymbol) {
54             fTagVals = new TreeSet<>();
55             fPositions = new HashSet<>();
56             fDtran = new int[maxInputSymbol+1];    // fDtran needs to be pre-sized.
57                                                     //   It is indexed by input symbols, and will
58                                                     //   hold  the next state number for each
59                                                     //   symbol.
60         }
61     }
62 
63 
64     private  RBBIRuleBuilder  fRB;
65 
66     /** The array index into RBBIRuleBuilder.fTreeRoots for the parse tree to operate on. */
67     private  int  fRootIx;
68 
69     /** D states (Aho's terminology). Index is state number. */
70     private  List<RBBIStateDescriptor> fDStates;
71 
72     /** Synthesized safe table, a List of row arrays.  */
73     private List<short[]>    fSafeTable;
74 
75     private static final int MAX_STATE_FOR_8BITS_TABLE = 255;
76 
77     /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
78     int[] fLookAheadRuleMap;
79 
80     /** Counter used when assigning lookahead rule numbers.
81      * Contains the last look-ahead number already in use.
82      * The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved
83      * for non-lookahead accepting states. See the declarations of RBBIStateTableRowT.   */
84     int  fLASlotsInUse = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL;
85 
86     //-----------------------------------------------------------------------------
87     //
88     //  Constructor    for RBBITableBuilder.
89     //
90     //                 rootNode is an index into the array of root nodes that is held by
91     //                          the overall RBBIRuleBuilder.
92     //-----------------------------------------------------------------------------
RBBITableBuilder(RBBIRuleBuilder rb, int rootNodeIx)93     RBBITableBuilder(RBBIRuleBuilder rb,  int rootNodeIx)  {
94            fRootIx     = rootNodeIx;
95            fRB         = rb;
96            fDStates    = new ArrayList<>();
97         }
98 
99 
100 
101 
102        //-----------------------------------------------------------------------------
103        //
104        //   RBBITableBuilder::buildForwardTable  -  This is the main function for building
105        //                          the DFA state transition table from the RBBI rules parse tree.
106        //
107        //-----------------------------------------------------------------------------
buildForwardTable()108        void  buildForwardTable() {
109            // If there were no rules, just return.  This situation can easily arise
110            //   for the reverse rules.
111            if (fRB.fTreeRoots[fRootIx]==null) {
112                return;
113            }
114 
115            //
116            // Walk through the tree, replacing any references to $variables with a copy of the
117            //   parse tree for the substitution expression.
118            //
119            fRB.fTreeRoots[fRootIx] = fRB.fTreeRoots[fRootIx].flattenVariables(0);
120            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("ftree")>=0) {
121                System.out.println("Parse tree after flattening variable references.");
122                fRB.fTreeRoots[fRootIx].printTree(true);
123            }
124 
125            //
126            // If the rules contained any references to {bof}
127            //   add a {bof} <cat> <former root of tree> to the
128            //   tree.  Means that all matches must start out with the
129            //   {bof} fake character.
130            //
131            if (fRB.fSetBuilder.sawBOF()) {
132                RBBINode bofTop    = new RBBINode(RBBINode.opCat);
133                RBBINode bofLeaf   = new RBBINode(RBBINode.leafChar);
134                bofTop.fLeftChild  = bofLeaf;
135                bofTop.fRightChild = fRB.fTreeRoots[fRootIx];
136                bofLeaf.fParent    = bofTop;
137                bofLeaf.fVal       = 2;      // Reserved value for {bof}.
138                fRB.fTreeRoots[fRootIx] = bofTop;
139            }
140 
141            //
142            // Add a unique right-end marker to the expression.
143            //   Appears as a cat-node, left child being the original tree,
144            //   right child being the end marker.
145            //
146            RBBINode cn = new RBBINode(RBBINode.opCat);
147            cn.fLeftChild = fRB.fTreeRoots[fRootIx];
148            fRB.fTreeRoots[fRootIx].fParent = cn;
149            RBBINode endMarkerNode = cn.fRightChild = new RBBINode(RBBINode.endMark);
150            cn.fRightChild.fParent = cn;
151            fRB.fTreeRoots[fRootIx] = cn;
152 
153            //
154            //  Replace all references to UnicodeSets with the tree for the equivalent
155            //      expression.
156            //
157            fRB.fTreeRoots[fRootIx].flattenSets();
158            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("stree")>=0) {
159                System.out.println("Parse tree after flattening Unicode Set references.");
160                fRB.fTreeRoots[fRootIx].printTree(true);
161            }
162 
163 
164            //
165            // calculate the functions nullable, firstpos, lastpos and followpos on
166            // nodes in the parse tree.
167            //    See the algorithm description in Aho.
168            //    Understanding how this works by looking at the code alone will be
169            //       nearly impossible.
170            //
171            calcNullable(fRB.fTreeRoots[fRootIx]);
172            calcFirstPos(fRB.fTreeRoots[fRootIx]);
173            calcLastPos(fRB.fTreeRoots[fRootIx]);
174            calcFollowPos(fRB.fTreeRoots[fRootIx]);
175            if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("pos")>=0) {
176                System.out.print("\n");
177                printPosSets(fRB.fTreeRoots[fRootIx]);
178            }
179 
180            //
181            //  For "chained" rules, modify the followPos sets
182            //
183            if (fRB.fChainRules) {
184                calcChainedFollowPos(fRB.fTreeRoots[fRootIx], endMarkerNode);
185            }
186 
187            //
188            //  BOF (start of input) test fixup.
189            //
190            if (fRB.fSetBuilder.sawBOF()) {
191                bofFixup();
192            }
193 
194            //
195            // Build the DFA state transition tables.
196            //
197            buildStateTable();
198            mapLookAheadRules();
199            flagAcceptingStates();
200            flagLookAheadStates();
201            flagTaggedStates();
202 
203            //
204            // Update the global table of rule status {tag} values
205            // The rule builder has a global vector of status values that are common
206            //    for all tables.  Merge the ones from this table into the global set.
207            //
208            mergeRuleStatusVals();
209        }
210 
211 
212 
213        //-----------------------------------------------------------------------------
214        //
215        //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
216        //
217        //-----------------------------------------------------------------------------
calcNullable(RBBINode n)218        void calcNullable(RBBINode n) {
219            if (n == null) {
220                return;
221            }
222            if (n.fType == RBBINode.setRef ||
223                n.fType == RBBINode.endMark ) {
224                // These are non-empty leaf node types.
225                n.fNullable = false;
226                return;
227            }
228 
229            if (n.fType == RBBINode.lookAhead || n.fType == RBBINode.tag) {
230                // Lookahead marker node.  It's a leaf, so no recursion on children.
231                // It's nullable because it does not match any literal text from the input stream.
232                n.fNullable = true;
233                return;
234            }
235 
236 
237            // The node is not a leaf.
238            //  Calculate nullable on its children.
239            calcNullable(n.fLeftChild);
240            calcNullable(n.fRightChild);
241 
242            // Apply functions from table 3.40 in Aho
243            if (n.fType == RBBINode.opOr) {
244                n.fNullable = n.fLeftChild.fNullable || n.fRightChild.fNullable;
245            }
246            else if (n.fType == RBBINode.opCat) {
247                n.fNullable = n.fLeftChild.fNullable && n.fRightChild.fNullable;
248            }
249            else if (n.fType == RBBINode.opStar || n.fType == RBBINode.opQuestion) {
250                n.fNullable = true;
251            }
252            else {
253                n.fNullable = false;
254            }
255        }
256 
257 
258 
259 
260        //-----------------------------------------------------------------------------
261        //
262        //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
263        //
264        //-----------------------------------------------------------------------------
calcFirstPos(RBBINode n)265        void calcFirstPos(RBBINode n) {
266            if (n == null) {
267                return;
268            }
269            if (n.fType == RBBINode.leafChar  ||
270                n.fType == RBBINode.endMark   ||
271                n.fType == RBBINode.lookAhead ||
272                n.fType == RBBINode.tag) {
273                // These are non-empty leaf node types.
274                n.fFirstPosSet.add(n);
275                return;
276            }
277 
278            // The node is not a leaf.
279            //  Calculate firstPos on its children.
280            calcFirstPos(n.fLeftChild);
281            calcFirstPos(n.fRightChild);
282 
283            // Apply functions from table 3.40 in Aho
284            if (n.fType == RBBINode.opOr) {
285                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
286                n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
287            }
288            else if (n.fType == RBBINode.opCat) {
289                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
290                if (n.fLeftChild.fNullable) {
291                    n.fFirstPosSet.addAll(n.fRightChild.fFirstPosSet);
292                }
293            }
294            else if (n.fType == RBBINode.opStar ||
295                     n.fType == RBBINode.opQuestion ||
296                     n.fType == RBBINode.opPlus) {
297                n.fFirstPosSet.addAll(n.fLeftChild.fFirstPosSet);
298            }
299        }
300 
301 
302 
303        //-----------------------------------------------------------------------------
304        //
305        //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
306        //
307        //-----------------------------------------------------------------------------
calcLastPos(RBBINode n)308        void calcLastPos(RBBINode n) {
309            if (n == null) {
310                return;
311            }
312            if (n.fType == RBBINode.leafChar  ||
313                n.fType == RBBINode.endMark   ||
314                n.fType == RBBINode.lookAhead ||
315                n.fType == RBBINode.tag) {
316                // These are non-empty leaf node types.
317                n.fLastPosSet.add(n);
318                return;
319            }
320 
321            // The node is not a leaf.
322            //  Calculate lastPos on its children.
323            calcLastPos(n.fLeftChild);
324            calcLastPos(n.fRightChild);
325 
326            // Apply functions from table 3.40 in Aho
327            if (n.fType == RBBINode.opOr) {
328                n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
329                n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
330            }
331            else if (n.fType == RBBINode.opCat) {
332                n.fLastPosSet.addAll(n.fRightChild.fLastPosSet);
333                if (n.fRightChild.fNullable) {
334                    n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
335                }
336            }
337            else if (n.fType == RBBINode.opStar     ||
338                     n.fType == RBBINode.opQuestion ||
339                     n.fType == RBBINode.opPlus) {
340                n.fLastPosSet.addAll(n.fLeftChild.fLastPosSet);
341            }
342        }
343 
344 
345 
346        //-----------------------------------------------------------------------------
347        //
348        //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
349        //
350        //-----------------------------------------------------------------------------
calcFollowPos(RBBINode n)351        void calcFollowPos(RBBINode n) {
352            if (n == null ||
353                n.fType == RBBINode.leafChar ||
354                n.fType == RBBINode.endMark) {
355                return;
356            }
357 
358            calcFollowPos(n.fLeftChild);
359            calcFollowPos(n.fRightChild);
360 
361            // Aho rule #1
362            if (n.fType == RBBINode.opCat) {
363                for (RBBINode i /* is 'i' in Aho's description */ : n.fLeftChild.fLastPosSet) {
364                    i.fFollowPos.addAll(n.fRightChild.fFirstPosSet);
365                }
366            }
367 
368            // Aho rule #2
369            if (n.fType == RBBINode.opStar ||
370                n.fType == RBBINode.opPlus) {
371                for (RBBINode i /* again, n and i are the names from Aho's description */ : n.fLastPosSet) {
372                    i.fFollowPos.addAll(n.fFirstPosSet);
373                }
374            }
375        }
376 
377        //-----------------------------------------------------------------------------
378        //
379        //           addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
380        //                               as roots of a rule to a destination vector.
381        //
382        //-----------------------------------------------------------------------------
addRuleRootNodes(List<RBBINode> dest, RBBINode node)383        void addRuleRootNodes(List<RBBINode> dest, RBBINode node) {
384            if (node == null) {
385                return;
386            }
387            if (node.fRuleRoot) {
388                dest.add(node);
389                // Note: rules cannot nest. If we found a rule start node,
390                //       no child node can also be a start node.
391                return;
392            }
393            addRuleRootNodes(dest, node.fLeftChild);
394            addRuleRootNodes(dest, node.fRightChild);
395        }
396 
397        //-----------------------------------------------------------------------------
398        //
399        //   calcChainedFollowPos.    Modify the previously calculated followPos sets
400        //                            to implement rule chaining.  NOT described by Aho
401        //
402        //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode tree, RBBINode endMarkNode)403        void calcChainedFollowPos(RBBINode tree, RBBINode endMarkNode) {
404 
405            List<RBBINode> leafNodes      = new ArrayList<>();
406 
407            // get a list all leaf nodes
408            tree.findNodes(leafNodes, RBBINode.leafChar);
409 
410            // Collect all leaf nodes that can start matches for rules
411            // with inbound chaining enabled, which is the union of the
412            // firstPosition sets from each of the rule root nodes.
413 
414            List<RBBINode> ruleRootNodes = new ArrayList<>();
415            addRuleRootNodes(ruleRootNodes, tree);
416 
417            Set<RBBINode> matchStartNodes = new HashSet<>();
418            for (RBBINode node: ruleRootNodes) {
419                if (node.fChainIn) {
420                    matchStartNodes.addAll(node.fFirstPosSet);
421                }
422            }
423 
424            // Iterate over all leaf nodes,
425            //
426            for (RBBINode endNode : leafNodes) {
427 
428                // Identify leaf nodes that correspond to overall rule match positions.
429                //   These include the endMarkNode in their followPos sets.
430                //
431                // Note: do not consider other end marker nodes, those that are added to
432                //       look-ahead rules. These can't chain; a match immediately stops
433                //       further matching. This leaves exactly one end marker node, the one
434                //       at the end of the complete tree.
435 
436                if (!endNode.fFollowPos.contains(endMarkNode)) {
437                    continue;
438                }
439 
440                // We've got a node that can end a match.
441 
442                // Now iterate over the nodes that can start a match, looking for ones
443                //   with the same char class as our ending node.
444                for (RBBINode startNode : matchStartNodes) {
445                    if (startNode.fType != RBBINode.leafChar) {
446                        continue;
447                    }
448 
449                    if (endNode.fVal == startNode.fVal) {
450                        // The end val (character class) of one possible match is the
451                        //   same as the start of another.
452 
453                        // Add all nodes from the followPos of the start node to the
454                        //  followPos set of the end node, which will have the effect of
455                        //  letting matches transition from a match state at endNode
456                        //  to the second char of a match starting with startNode.
457                        endNode.fFollowPos.addAll(startNode.fFollowPos);
458                    }
459                }
460            }
461        }
462 
463 
464        //-----------------------------------------------------------------------------
465        //
466        //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
467        //                Do an swizzle similar to chaining, modifying the followPos set of
468        //                the bofNode to include the followPos nodes from other {bot} nodes
469        //                scattered through the tree.
470        //
471        //                This function has much in common with calcChainedFollowPos().
472        //
473        //-----------------------------------------------------------------------------
bofFixup()474        void bofFixup() {
475            //
476            //   The parse tree looks like this ...
477            //         fTree root  --.       <cat>
478            //                               /     \
479            //                            <cat>   <#end node>
480            //                           /     \
481            //                     <bofNode>   rest
482            //                               of tree
483            //
484            //    We will be adding things to the followPos set of the <bofNode>
485            //
486            RBBINode  bofNode = fRB.fTreeRoots[fRootIx].fLeftChild.fLeftChild;
487            Assert.assrt(bofNode.fType == RBBINode.leafChar);
488            Assert.assrt(bofNode.fVal == 2);
489 
490            // Get all nodes that can be the start a match of the user-written rules
491            //  (excluding the fake bofNode)
492            //  We want the nodes that can start a match in the
493            //     part labeled "rest of tree"
494            //
495            Set<RBBINode> matchStartNodes = fRB.fTreeRoots[fRootIx].fLeftChild.fRightChild.fFirstPosSet;
496            for (RBBINode startNode : matchStartNodes) {
497                if (startNode.fType != RBBINode.leafChar) {
498                    continue;
499                }
500 
501                if (startNode.fVal == bofNode.fVal) {
502                    //  We found a leaf node corresponding to a {bof} that was
503                    //    explicitly written into a rule.
504                    //  Add everything from the followPos set of this node to the
505                    //    followPos set of the fake bofNode at the start of the tree.
506                    //
507                    bofNode.fFollowPos.addAll(startNode.fFollowPos);
508                }
509            }
510        }
511 
512        //-----------------------------------------------------------------------------
513        //
514        //   buildStateTable()    Determine the set of runtime DFA states and the
515        //                        transition tables for these states, by the algorithm
516        //                        of fig. 3.44 in Aho.
517        //
518        //                        Most of the comments are quotes of Aho's psuedo-code.
519        //
520        //-----------------------------------------------------------------------------
buildStateTable()521        void buildStateTable() {
522            //
523            // Add a dummy state 0 - the stop state.  Not from Aho.
524            int      lastInputSymbol = fRB.fSetBuilder.getNumCharCategories() - 1;
525            RBBIStateDescriptor failState = new RBBIStateDescriptor(lastInputSymbol);
526            fDStates.add(failState);
527 
528            // initially, the only unmarked state in Dstates is firstpos(root),
529            //       where toot is the root of the syntax tree for (r)#;
530            RBBIStateDescriptor initialState = new RBBIStateDescriptor(lastInputSymbol);
531            initialState.fPositions.addAll(fRB.fTreeRoots[fRootIx].fFirstPosSet);
532            fDStates.add(initialState);
533 
534            // while there is an unmarked state T in Dstates do begin
535            for (;;) {
536                RBBIStateDescriptor T = null;
537                int              tx;
538                for (tx=1; tx<fDStates.size(); tx++) {
539                    RBBIStateDescriptor temp  = fDStates.get(tx);
540                    if (temp.fMarked == false) {
541                        T = temp;
542                        break;
543                    }
544                }
545                if (T == null) {
546                    break;
547                }
548 
549                // mark T;
550                T.fMarked = true;
551 
552                // for each input symbol a do begin
553                int  a;
554                for (a = 1; a<=lastInputSymbol; a++) {
555                    // let U be the set of positions that are in followpos(p)
556                    //    for some position p in T
557                    //    such that the symbol at position p is a;
558                    Set<RBBINode> U = null;
559                    for (RBBINode p : T.fPositions) {
560                        if ((p.fType == RBBINode.leafChar) &&  (p.fVal == a)) {
561                            if (U == null) {
562                                U = new HashSet<>();
563                            }
564                            U.addAll(p.fFollowPos);
565                        }
566                    }
567 
568                    // if U is not empty and not in DStates then
569                    int  ux = 0;
570                    boolean    UinDstates = false;
571                    if (U != null) {
572                        Assert.assrt(U.size() > 0);
573                        int  ix;
574                        for (ix=0; ix<fDStates.size(); ix++) {
575                            RBBIStateDescriptor temp2;
576                            temp2 = fDStates.get(ix);
577                            if (U.equals(temp2.fPositions)) {
578                                U  = temp2.fPositions;
579                                ux = ix;
580                                UinDstates = true;
581                                break;
582                            }
583                        }
584 
585                        // Add U as an unmarked state to Dstates
586                        if (!UinDstates)
587                        {
588                            RBBIStateDescriptor newState = new RBBIStateDescriptor(lastInputSymbol);
589                            newState.fPositions = U;
590                            fDStates.add(newState);
591                            ux = fDStates.size()-1;
592                        }
593 
594                        // Dtran[T, a] := U;
595                        T.fDtran[a] = ux;
596                    }
597                }
598            }
599        }
600 
601       /**
602        * mapLookAheadRules
603        *
604        */
mapLookAheadRules()605       void mapLookAheadRules() {
606           fLookAheadRuleMap =  new int[fRB.fScanner.numRules() + 1];
607 
608           for (RBBIStateDescriptor sd: fDStates) {
609               int laSlotForState = 0;
610 
611               // Establish the look-ahead slot for this state, if the state covers
612               // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
613 
614               // If any of the look-ahead nodes already have a slot assigned, use it,
615               // otherwise assign a new one.
616 
617               boolean sawLookAheadNode = false;
618               for (RBBINode node: sd.fPositions) {
619                   if (node.fType != RBBINode.lookAhead) {
620                       continue;
621                   }
622                   sawLookAheadNode = true;
623                   int ruleNum = node.fVal;     // Set when rule was originally parsed.
624                   assert(ruleNum < fLookAheadRuleMap.length);
625                   assert(ruleNum > 0);
626                   int laSlot = fLookAheadRuleMap[ruleNum];
627                   if (laSlot != 0) {
628                       if (laSlotForState == 0) {
629                           laSlotForState = laSlot;
630                       } else {
631                           // TODO: figure out if this can fail, change to setting an error code if so.
632                           assert(laSlot == laSlotForState);
633                       }
634                   }
635               }
636               if (!sawLookAheadNode) {
637                   continue;
638               }
639 
640               if (laSlotForState == 0) {
641                   laSlotForState = ++fLASlotsInUse;
642               }
643 
644               // For each look ahead node covered by this state,
645               // set the mapping from the node's rule number to the look ahead slot.
646               // There can be multiple nodes/rule numbers going to the same la slot.
647 
648               for (RBBINode node: sd.fPositions) {
649                   if (node.fType != RBBINode.lookAhead) {
650                       continue;
651                   }
652                   int ruleNum = node.fVal;     // Set when rule was originally parsed.
653                   int existingVal = fLookAheadRuleMap[ruleNum];
654                   assert(existingVal == 0 || existingVal == laSlotForState);
655                   fLookAheadRuleMap[ruleNum] = laSlotForState;
656               }
657           }
658 
659       }
660 
661        //-----------------------------------------------------------------------------
662        //
663        //   flagAcceptingStates    Identify accepting states.
664        //                          First get a list of all of the end marker nodes.
665        //                          Then, for each state s,
666        //                              if s contains one of the end marker nodes in its list of tree positions then
667        //                                  s is an accepting state.
668        //
669        //-----------------------------------------------------------------------------
flagAcceptingStates()670        void     flagAcceptingStates() {
671            List<RBBINode> endMarkerNodes = new ArrayList<>();
672            RBBINode    endMarker;
673            int     i;
674            int     n;
675 
676            fRB.fTreeRoots[fRootIx].findNodes(endMarkerNodes, RBBINode.endMark);
677 
678            for (i=0; i<endMarkerNodes.size(); i++) {
679                endMarker = endMarkerNodes.get(i);
680                for (n=0; n<fDStates.size(); n++) {
681                    RBBIStateDescriptor sd = fDStates.get(n);
682                    if (sd.fPositions.contains(endMarker)) {
683                        // Any non-zero value for fAccepting means this is an accepting node.
684                        // The value is what will be returned to the user as the break status.
685                        // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
686 
687                        if (sd.fAccepting==0) {
688                            // State hasn't been marked as accepting yet.  Do it now.
689                            sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
690                            if (sd.fAccepting == 0) {
691                                sd.fAccepting = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL;
692                            }
693                        }
694                        if (sd.fAccepting==RBBIDataWrapper.ACCEPTING_UNCONDITIONAL && endMarker.fVal != 0) {
695                            // Both lookahead and non-lookahead accepting for this state.
696                            // Favor the look-ahead, because a look-ahead match needs to
697                            // immediately stop the run-time engine. First match, not longest.
698                            sd.fAccepting = fLookAheadRuleMap[endMarker.fVal];
699                        }
700                        // implicit else:
701                        // if sd.fAccepting already had a value other than 0 or 1, leave it be.
702                    }
703                }
704            }
705        }
706 
707 
708        //-----------------------------------------------------------------------------
709        //
710        //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
711        //
712        //-----------------------------------------------------------------------------
flagLookAheadStates()713        void     flagLookAheadStates() {
714            List<RBBINode> lookAheadNodes = new ArrayList<>();
715            RBBINode    lookAheadNode;
716            int     i;
717            int     n;
718 
719            fRB.fTreeRoots[fRootIx].findNodes(lookAheadNodes, RBBINode.lookAhead);
720            for (i=0; i<lookAheadNodes.size(); i++) {
721                lookAheadNode = lookAheadNodes.get(i);
722                for (n=0; n<fDStates.size(); n++) {
723                    RBBIStateDescriptor sd = fDStates.get(n);
724                    if (sd.fPositions.contains(lookAheadNode)) {
725                        int lookaheadSlot = fLookAheadRuleMap[lookAheadNode.fVal];
726                        assert(sd.fLookAhead == 0 || sd.fLookAhead == lookaheadSlot);
727                        sd.fLookAhead = lookaheadSlot;
728                    }
729                }
730            }
731        }
732 
733 
734 
735 
736        //-----------------------------------------------------------------------------
737        //
738        //    flagTaggedStates
739        //
740        //-----------------------------------------------------------------------------
flagTaggedStates()741        void     flagTaggedStates() {
742            List<RBBINode> tagNodes = new ArrayList<>();
743            RBBINode    tagNode;
744            int     i;
745            int     n;
746 
747            fRB.fTreeRoots[fRootIx].findNodes(tagNodes, RBBINode.tag);
748            for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
749                tagNode = tagNodes.get(i);
750 
751                for (n=0; n<fDStates.size(); n++) {              //    For each state  s (row in the state table)
752                    RBBIStateDescriptor sd = fDStates.get(n);
753                    if (sd.fPositions.contains(tagNode)) {       //       if  s include the tag node t
754                        sd.fTagVals.add(tagNode.fVal);
755                    }
756                }
757            }
758        }
759 
760 
761 
762        //-----------------------------------------------------------------------------
763        //
764        //  mergeRuleStatusVals
765        //
766        //      Allocate positions in the  global array of rule status {tag} values
767        //
768        //      The RBBI runtime uses an array of {sets of status values} that can
769        //      be returned for boundaries.  Each accepting state that has non-zero
770        //      status includes an index into this array.  The format of the array
771        //      is
772        //           Num of status values in group 1
773        //              status val
774        //              status val
775        //              ...
776        //           Num of status vals in group 2
777        //              status val
778        //              status val
779        //              ...
780        //           etc.
781        //
782        //
783        //-----------------------------------------------------------------------------
784 
mergeRuleStatusVals()785        void  mergeRuleStatusVals() {
786            //
787            //  The basic outline of what happens here is this...
788            //
789            //    for each state in this state table
790            //       if the status tag list for this state is in the global statuses list
791            //           record where and
792            //           continue with the next state
793            //       else
794            //           add the tag list for this state to the global list.
795            //
796            int n;
797 
798            // Pre-load a single tag of {0} into the table.
799            //   We will need this as a default, for rule sets with no explicit tagging,
800            //   or with explicit tagging of {0}.
801            if (fRB.fRuleStatusVals.size() == 0) {
802                fRB.fRuleStatusVals.add(1);    // Num of statuses in group
803                fRB.fRuleStatusVals.add(0);    //   and our single status of zero
804 
805                SortedSet<Integer> s0 = new TreeSet<>();        // mapping for rules with no explicit tagging
806                fRB.fStatusSets.put(s0, 0);    //   (key is an empty set).
807 
808                SortedSet<Integer> s1 = new TreeSet<>();        // mapping for rules with explicit tagging of {0}
809                s1.add(0);
810                fRB.fStatusSets.put(s1, 0);
811            }
812 
813            //    For each state, check whether the state's status tag values are
814            //       already entered into the status values array, and add them if not.
815            for (n=0; n<fDStates.size(); n++) {
816                RBBIStateDescriptor sd = fDStates.get(n);
817                Set<Integer> statusVals = sd.fTagVals;
818                Integer arrayIndexI = fRB.fStatusSets.get(statusVals);
819                if (arrayIndexI == null) {
820                    // This is the first encounter of this set of status values.
821                    //   Add them to the statusSets map, This map associates
822                    //   the set of status values with an index in the runtime status
823                    //   values array.
824                    arrayIndexI = fRB.fRuleStatusVals.size();
825                    fRB.fStatusSets.put(statusVals, arrayIndexI);
826 
827                    // Add the new set of status values to the vector of values that
828                    //   will eventually become the array used by the runtime engine.
829                    fRB.fRuleStatusVals.add(statusVals.size());
830                    fRB.fRuleStatusVals.addAll(statusVals);
831                }
832 
833                // Save the runtime array index back into the state descriptor.
834                sd.fTagsIdx = arrayIndexI.intValue();
835            }
836        }
837 
838 
839 
840 
841 
842 
843 
844        //-----------------------------------------------------------------------------
845        //
846        //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
847        //                 for each node in the tree.
848        //
849        //-----------------------------------------------------------------------------
850 
printPosSets(RBBINode n)851        void printPosSets(RBBINode n) {
852            if (n==null) {
853                return;
854            }
855            RBBINode.printNode(n);
856            System.out.print("         Nullable:  " + n.fNullable);
857 
858            System.out.print("         firstpos:  ");
859            printSet(n.fFirstPosSet);
860 
861            System.out.print("         lastpos:   ");
862            printSet(n.fLastPosSet);
863 
864            System.out.print("         followpos: ");
865            printSet(n.fFollowPos);
866 
867            printPosSets(n.fLeftChild);
868            printPosSets(n.fRightChild);
869        }
870 
871 
872 
873        /**
874         *  Find duplicate (redundant) character classes. Begin looking with categories.first.
875         *  Duplicates, if found are returned in the categories parameter.
876         *  This is an iterator-like function, used to identify character classes
877         *  (state table columns) that can be eliminated.
878         *  @param categories in/out parameter, specifies where to start looking for duplicates,
879         *                and returns the first pair of duplicates found, if any.
880         *  @return true if duplicate char classes were found, false otherwise.
881         *  @internal
882         */
findDuplCharClassFrom(RBBIRuleBuilder.IntPair categories)883        boolean findDuplCharClassFrom(RBBIRuleBuilder.IntPair categories) {
884            int numStates = fDStates.size();
885            int numCols = fRB.fSetBuilder.getNumCharCategories();
886 
887            int table_base = 0;
888            int table_dupl = 0;
889            for (; categories.first < numCols-1; ++categories.first) {
890                // Note: dictionary & non-dictionary columns cannot be merged.
891                //       The limitSecond value prevents considering mixed pairs.
892                //       Dictionary categories are >= DictCategoriesStart.
893                //       Non dict categories are   <  DictCategoriesStart.
894                int limitSecond = categories.first < fRB.fSetBuilder.getDictCategoriesStart() ?
895                    fRB.fSetBuilder.getDictCategoriesStart() : numCols;
896                for (categories.second=categories.first+1; categories.second < limitSecond; ++categories.second) {
897                    for (int state=0; state<numStates; state++) {
898                        RBBIStateDescriptor sd = fDStates.get(state);
899                        table_base = sd.fDtran[categories.first];
900                        table_dupl = sd.fDtran[categories.second];
901                        if (table_base != table_dupl) {
902                            break;
903                        }
904                    }
905                    if (table_base == table_dupl) {
906                        return true;
907                    }
908                }
909            }
910            return false;
911        }
912 
913        /**
914         * Remove a column from the state table. Used when two character categories
915         * have been found equivalent, and merged together, to eliminate the unneeded table column.
916         */
917        void removeColumn(int column) {
918            int numStates = fDStates.size();
919            for (int state=0; state<numStates; state++) {
920                RBBIStateDescriptor sd = fDStates.get(state);
921                assert(column < sd.fDtran.length);
922                int[] newArray = Arrays.copyOf(sd.fDtran, sd.fDtran.length - 1);
923                System.arraycopy(sd.fDtran, column+1, newArray, column, newArray.length - column);
924                sd.fDtran = newArray;
925            }
926        }
927 
928 
929        /**
930         *  Find duplicate (redundant) states, beginning at the specified pair,
931         *  within this state table. This is an iterator-like function, used to
932         *  identify states (state table rows) that can be eliminated.
933         *  @param states in/out parameter, specifies where to start looking for duplicates,
934         *                and returns the first pair of duplicates found, if any.
935         *  @return true if duplicate states were found, false otherwise.
936         *  @internal
937         */
938        boolean findDuplicateState(RBBIRuleBuilder.IntPair states) {
939            int numStates = fDStates.size();
940            int numCols = fRB.fSetBuilder.getNumCharCategories();
941 
942            for (; states.first<numStates-1; ++states.first) {
943                RBBIStateDescriptor firstSD = fDStates.get(states.first);
944                for (states.second=states.first+1; states.second<numStates; ++states.second) {
945                    RBBIStateDescriptor duplSD = fDStates.get(states.second);
946                    if (firstSD.fAccepting != duplSD.fAccepting ||
947                            firstSD.fLookAhead != duplSD.fLookAhead ||
948                            firstSD.fTagsIdx   != duplSD.fTagsIdx) {
949                        continue;
950                    }
951                    boolean rowsMatch = true;
952                    for (int col=0; col < numCols; ++col) {
953                        int firstVal = firstSD.fDtran[col];
954                        int duplVal = duplSD.fDtran[col];
955                        if (!((firstVal == duplVal) ||
956                                ((firstVal == states.first || firstVal == states.second) &&
957                                        (duplVal  == states.first || duplVal  == states.second)))) {
958                            rowsMatch = false;
959                            break;
960                        }
961                    }
962                    if (rowsMatch) {
963                        return true;
964                    }
965                }
966            }
967            return false;
968        }
969 
970        /**
971         *  Find the next duplicate state in the safe reverse table. An iterator function.
972         *  @param states in/out parameter, specifies where to start looking for duplicates,
973         *                and returns the first pair of duplicates found, if any.
974         *  @return true if duplicate states were found, false otherwise.
975         *  @internal
976         */
977        boolean findDuplicateSafeState(RBBIRuleBuilder.IntPair states) {
978            int numStates = fSafeTable.size();
979 
980            for (; states.first<numStates-1; ++states.first) {
981                short[] firstRow = fSafeTable.get(states.first);
982                for (states.second=states.first+1; states.second<numStates; ++states.second) {
983                    short[] duplRow = fSafeTable.get(states.second);
984                    boolean rowsMatch = true;
985                    int numCols = firstRow.length;
986                    for (int col=0; col < numCols; ++col) {
987                        int firstVal = firstRow[col];
988                        int duplVal = duplRow[col];
989                        if (!((firstVal == duplVal) ||
990                                ((firstVal == states.first || firstVal == states.second) &&
991                                        (duplVal  == states.first || duplVal  == states.second)))) {
992                            rowsMatch = false;
993                            break;
994                        }
995                    }
996                    if (rowsMatch) {
997                        return true;
998                    }
999                }
1000            }
1001            return false;
1002        }
1003 
1004        /**
1005         * Remove a duplicate state (row) from the state table. All references to the deleted (second) state
1006         * are redirected to first state.
1007         * @param duplStates The duplicate pair of states.
1008         * @internal
1009         */
1010        void removeState(IntPair duplStates) {
1011            final int keepState = duplStates.first;
1012            final int duplState = duplStates.second;
1013            assert(keepState < duplState);
1014            assert(duplState < fDStates.size());
1015 
1016            fDStates.remove(duplState);
1017 
1018            int numStates = fDStates.size();
1019            int numCols = fRB.fSetBuilder.getNumCharCategories();
1020            for (int state=0; state<numStates; ++state) {
1021                RBBIStateDescriptor sd = fDStates.get(state);
1022                for (int col=0; col<numCols; col++) {
1023                    int existingVal = sd.fDtran[col];
1024                    int newVal = existingVal;
1025                    if (existingVal == duplState) {
1026                        newVal = keepState;
1027                    } else if (existingVal > duplState) {
1028                        newVal = existingVal - 1;
1029                    }
1030                    sd.fDtran[col] = newVal;
1031                }
1032            }
1033        }
1034 
1035        /**
1036         * Remove a duplicate state from the safe table.
1037         * @param duplStates The duplicate pair of states.  The first is kept, the second is removed.
1038         *                   All references to the second in the state table are retargeted
1039         *                   to the first.
1040         * @internal
1041         */
1042        void removeSafeState(IntPair duplStates) {
1043            final int keepState = duplStates.first;
1044            final int duplState = duplStates.second;
1045            assert(keepState < duplState);
1046            assert(duplState < fSafeTable.size());
1047 
1048            fSafeTable.remove(duplState);
1049            int numStates = fSafeTable.size();
1050            for (int state=0; state<numStates; ++state) {
1051                short[] row = fSafeTable.get(state);
1052                for (int col=0; col<row.length; col++) {
1053                    int existingVal = row[col];
1054                    int newVal = existingVal;
1055                    if (existingVal == duplState) {
1056                        newVal = keepState;
1057                    } else if (existingVal > duplState) {
1058                        newVal = existingVal - 1;
1059                    }
1060                    row[col] = (short)newVal;
1061                }
1062            }
1063        }
1064 
1065 
1066        /**
1067         *  Check for, and remove duplicate states (table rows).
1068         *  @return the number of states removed.
1069         *  @internal
1070         */
1071        int removeDuplicateStates() {
1072            IntPair dupls = new IntPair(3, 0);
1073            int numStatesRemoved = 0;
1074 
1075            while (findDuplicateState(dupls)) {
1076                // System.out.printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1077                removeState(dupls);
1078                ++numStatesRemoved;
1079            }
1080            return numStatesRemoved;
1081        }
1082 
1083 
1084        /**
1085         *  Calculate the size in bytes of the serialized form of this state transition table,
1086         *  which is identical to the ICU4C runtime form.
1087         *  Refer to common/rbbidata.h from ICU4C for the declarations of the structures
1088         *  being matched by this calculation.
1089         */
1090        int  getTableSize()  {
1091            if (fRB.fTreeRoots[fRootIx] == null) {
1092                return 0;
1093            }
1094            int size    = RBBIDataWrapper.RBBIStateTable.fHeaderSize;    // The header, with no rows to the table.
1095            int numRows = fDStates.size();
1096            int numCols = fRB.fSetBuilder.getNumCharCategories();
1097            boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
1098            int rowSize = (use8Bits ? 1 : 2 ) * (RBBIDataWrapper.NEXTSTATES + numCols);
1099            size   += numRows * rowSize;
1100            size = (size + 7) & ~7;   // round up to a multiple of 8 bytes
1101            return size;
1102        }
1103 
1104 
1105 
1106        /**
1107         * Create a RBBIDataWrapper.RBBIStateTable for a newly compiled table.
1108         * RBBIDataWrapper.RBBIStateTable is similar to struct RBBIStateTable in ICU4C,
1109         * in common/rbbidata.h
1110         */
1111        RBBIDataWrapper.RBBIStateTable exportTable() {
1112            int                state;
1113            int                col;
1114 
1115            RBBIDataWrapper.RBBIStateTable table = new RBBIDataWrapper.RBBIStateTable();
1116            if (fRB.fTreeRoots[fRootIx] == null) {
1117                return table;
1118            }
1119 
1120            Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff &&
1121                fDStates.size() < 0x7fff);
1122            table.fNumStates = fDStates.size();
1123            table.fDictCategoriesStart = fRB.fSetBuilder.getDictCategoriesStart();
1124            table.fLookAheadResultsSize =
1125                    fLASlotsInUse == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
1126            boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
1127 
1128            // Size of table size in shorts.
1129            int rowLen = RBBIDataWrapper.NEXTSTATES + fRB.fSetBuilder.getNumCharCategories();   // Row Length in shorts.
1130            int tableSize;
1131            if (use8Bits) {
1132                tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize);       // fTable length in bytes.
1133                table.fTable = new char[tableSize];
1134                table.fRowLen = rowLen;                          // Row length in bytes.
1135            } else {
1136                tableSize = (getTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize) / 2;   // fTable length in shorts.
1137                table.fTable = new char[tableSize];
1138                table.fRowLen = rowLen * 2;                      // Row length in bytes.
1139            }
1140 
1141            if (fRB.fLookAheadHardBreak) {
1142                table.fFlags  |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK;
1143            }
1144            if (fRB.fSetBuilder.sawBOF()) {
1145                table.fFlags  |= RBBIDataWrapper.RBBI_BOF_REQUIRED;
1146            }
1147            if (use8Bits) {
1148                table.fFlags  |= RBBIDataWrapper.RBBI_8BITS_ROWS;
1149            }
1150 
1151            int numCharCategories = fRB.fSetBuilder.getNumCharCategories();
1152            for (state=0; state<table.fNumStates; state++) {
1153                RBBIStateDescriptor sd = fDStates.get(state);
1154                int row = state*rowLen;
1155                if (use8Bits) {
1156                    Assert.assrt (0 <= sd.fAccepting && sd.fAccepting <= 255);
1157                    Assert.assrt (0 <= sd.fLookAhead && sd.fLookAhead <= 255);
1158                } else {
1159                    Assert.assrt (0 <= sd.fAccepting && sd.fAccepting <= 0xffff);
1160                    Assert.assrt (0 <= sd.fLookAhead && sd.fLookAhead <= 0xffff);
1161                }
1162                table.fTable[row + RBBIDataWrapper.ACCEPTING] = (char)sd.fAccepting;
1163                table.fTable[row + RBBIDataWrapper.LOOKAHEAD] = (char)sd.fLookAhead;
1164                table.fTable[row + RBBIDataWrapper.TAGSIDX]   = (char)sd.fTagsIdx;
1165                for (col=0; col<numCharCategories; col++) {
1166                    if (use8Bits) {
1167                        Assert.assrt (0 <= sd.fDtran[col] && sd.fDtran[col] <= MAX_STATE_FOR_8BITS_TABLE);
1168                    }
1169                    table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = (char)sd.fDtran[col];
1170                }
1171            }
1172            return table;
1173        }
1174 
1175        /**
1176         *   Synthesize a safe state table from the main state table.
1177         */
1178        void buildSafeReverseTable() {
1179            // Safe Reverse table construction is described in more detail in the corresponding
1180            // function in ICU4C, in source/common/rbbitblb.cpp. Not duplicated here because
1181            // it is too likely to get out of sync.
1182 
1183            // Each safe pair is stored as two chars in the safePair stringBuilder.
1184            StringBuilder safePairs = new StringBuilder();
1185 
1186            int numCharClasses = fRB.fSetBuilder.getNumCharCategories();
1187            int numStates = fDStates.size();
1188 
1189            for (int c1=0; c1<numCharClasses; ++c1) {
1190                for (int c2=0; c2 < numCharClasses; ++c2) {
1191                    int wantedEndState = -1;
1192                    int endState = 0;
1193                    for (int startState = 1; startState < numStates; ++startState) {
1194                        RBBIStateDescriptor startStateD = fDStates.get(startState);
1195                        int s2 = startStateD.fDtran[c1];
1196                        RBBIStateDescriptor s2StateD = fDStates.get(s2);
1197                        endState = s2StateD.fDtran[c2];
1198                        if (wantedEndState < 0) {
1199                            wantedEndState = endState;
1200                        } else {
1201                            if (wantedEndState != endState) {
1202                                break;
1203                            }
1204                        }
1205                    }
1206                    if (wantedEndState == endState) {
1207                        safePairs.append((char)c1);
1208                        safePairs.append((char)c2);
1209                        // System.out.printf("(%d, %d) ", c1, c2);
1210                    }
1211                }
1212                // System.out.printf("\n");
1213            }
1214 
1215            // Populate the initial safe table.
1216            // The table as a whole is a List<short[]>
1217            // Row 0 is the stop state.
1218            // Row 1 is the start sate.
1219            // Row 2 and beyond are other states, initially one per char class, but
1220            //   after initial construction, many of the states will be combined, compacting the table.)
1221            // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1222            // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1223 
1224            assert(fSafeTable == null);
1225            fSafeTable = new ArrayList<>();
1226            for (int row=0; row<numCharClasses + 2; ++row) {
1227                fSafeTable.add(new short[numCharClasses]);
1228            }
1229 
1230            // From the start state, each input char class transitions to the state for that input.
1231            short[] startState = fSafeTable.get(1);
1232            for (int charClass=0; charClass < numCharClasses; ++charClass) {
1233                // Note: +2 to skip the start & stop state rows.
1234                startState[charClass] = (short)(charClass+2);
1235            }
1236 
1237            // Initially make every other state table row look like the start state row
1238            //    (except for the stop state, which remains all 0)
1239            for (int row=2; row<numCharClasses+2; ++row) {
1240                System.arraycopy(startState, 0, fSafeTable.get(row), 0, startState.length);
1241            }
1242 
1243            // Run through the safe pairs, set the next state to zero when pair has been seen.
1244            // Zero being the stop state, meaning we found a safe point.
1245            for (int pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1246                int c1 = safePairs.charAt(pairIdx);
1247                int c2 = safePairs.charAt(pairIdx + 1);
1248 
1249                short[] rowState = fSafeTable.get(c2 + 2);
1250                rowState[c1] = 0;
1251            }
1252 
1253            // Remove duplicate or redundant rows from the table.
1254            RBBIRuleBuilder.IntPair states = new RBBIRuleBuilder.IntPair(1, 0);
1255            while (findDuplicateSafeState(states)) {
1256                // System.out.printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1257                removeSafeState(states);
1258            }
1259        }
1260 
1261 
1262        /**
1263         *  Calculate the size of the runtime form of this safe state table.
1264         */
1265        int getSafeTableSize() {
1266            if (fSafeTable == null) {
1267                return 0;
1268            }
1269            int size    = RBBIDataWrapper.RBBIStateTable.fHeaderSize;    // The header, with no rows to the table.
1270            int numRows = fSafeTable.size();
1271            int numCols = fSafeTable.get(0).length;
1272            boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE;
1273 
1274            int rowSize = (use8Bits ? 1 : 2 ) * (RBBIDataWrapper.NEXTSTATES + numCols);
1275            size += numRows * rowSize;
1276            // TODO: there are redundant round-up. Figure out best place, get rid of the rest.
1277            size = (size + 7) & ~7;   // round up to a multiple of 8 bytes
1278            return size;
1279        }
1280 
1281 
1282        /**
1283         *  Create a RBBIDataWrapper.RBBIStateTable for the safe reverse table.
1284         *  RBBIDataWrapper.RBBIStateTable is similar to struct RBBIStateTable in ICU4C,
1285         *  in common/rbbidata.h
1286         */
1287        RBBIDataWrapper.RBBIStateTable exportSafeTable() {
1288            RBBIDataWrapper.RBBIStateTable table = new RBBIDataWrapper.RBBIStateTable();
1289            table.fNumStates = fSafeTable.size();
1290            boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE;
1291            int numCharCategories = fSafeTable.get(0).length;
1292 
1293            // Size of table size in shorts.
1294            int rowLen = RBBIDataWrapper.NEXTSTATES + numCharCategories;
1295            // TODO: tableSize is basically numStates * numCharCategories,
1296            //       except for alignment padding. Clean up here, and in main exportTable().
1297            int tableSize = (getSafeTableSize() - RBBIDataWrapper.RBBIStateTable.fHeaderSize);     // fTable length in bytes.
1298            if (use8Bits) {
1299                table.fFlags  |= RBBIDataWrapper.RBBI_8BITS_ROWS;
1300                table.fTable = new char[tableSize];
1301                table.fRowLen = rowLen;                          // Row length in bytes.
1302            } else {
1303                tableSize /= 2;   // fTable length in shorts.
1304                table.fTable = new char[tableSize];
1305                table.fRowLen = rowLen * 2;                      // Row length in bytes.
1306            }
1307 
1308            for (int state=0; state<table.fNumStates; state++) {
1309                short[] rowArray = fSafeTable.get(state);
1310                int row = state * rowLen;
1311 
1312                for (int col=0; col<numCharCategories; col++) {
1313                    if (use8Bits) {
1314                        Assert.assrt (rowArray[col] <= MAX_STATE_FOR_8BITS_TABLE);
1315                    }
1316                    table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = (char)rowArray[col];
1317                }
1318            }
1319            return table;
1320        }
1321 
1322 
1323        //-----------------------------------------------------------------------------
1324        //
1325        //   printSet    Debug function.   Print the contents of a set of Nodes
1326        //
1327        //-----------------------------------------------------------------------------
1328 
1329        void printSet(Collection<RBBINode> s) {
1330            for (RBBINode n : s) {
1331                RBBINode.printInt(n.fSerialNum, 8);
1332            }
1333            System.out.println();
1334        }
1335 
1336 
1337 
1338        //-----------------------------------------------------------------------------
1339        //
1340        //   printStates    Debug Function.  Dump the fully constructed state transition table.
1341        //
1342        //-----------------------------------------------------------------------------
1343 
1344        void printStates() {
1345            int     c;    // input "character"
1346            int     n;    // state number
1347 
1348            System.out.print("state |           i n p u t     s y m b o l s \n");
1349            System.out.print("      | Acc  LA    Tag");
1350            for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
1351                RBBINode.printInt(c, 4);
1352            }
1353            System.out.print("\n");
1354            System.out.print("      |---------------");
1355            for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
1356                System.out.print("----");
1357            }
1358            System.out.print("\n");
1359 
1360            for (n=0; n<fDStates.size(); n++) {
1361                RBBIStateDescriptor sd = fDStates.get(n);
1362                RBBINode.printInt(n, 5);
1363                System.out.print(" | ");
1364 
1365                RBBINode.printInt(sd.fAccepting, 3);
1366                RBBINode.printInt(sd.fLookAhead, 4);
1367                RBBINode.printInt(sd.fTagsIdx, 6);
1368                System.out.print(" ");
1369                for (c=0; c<fRB.fSetBuilder.getNumCharCategories(); c++) {
1370                    RBBINode.printInt(sd.fDtran[c], 4);
1371                }
1372                System.out.print("\n");
1373            }
1374            System.out.print("\n\n");
1375        }
1376 
1377 
1378        /**
1379         * Debug Function.  Dump the fully constructed safe reverse table.
1380         */
1381        void printReverseTable() {
1382            int     c;    // input "character"
1383 
1384            System.out.printf("    Safe Reverse Table \n");
1385            if (fSafeTable == null) {
1386                System.out.printf("   --- nullptr ---\n");
1387                return;
1388            }
1389            int numCharCategories = fSafeTable.get(0).length;
1390            System.out.printf("state |           i n p u t     s y m b o l s \n");
1391            System.out.printf("      | Acc  LA    Tag");
1392            for (c=0; c< numCharCategories;  c++) {
1393                System.out.printf(" %2d", c);
1394            }
1395            System.out.printf("\n");
1396            System.out.printf("      |---------------");
1397            for (c=0; c<numCharCategories; c++) {
1398                System.out.printf("---");
1399            }
1400            System.out.printf("\n");
1401 
1402            for (int n=0; n<fSafeTable.size(); n++) {
1403                short rowArray[]  = fSafeTable.get(n);
1404                System.out.printf("  %3d | " , n);
1405                System.out.printf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
1406                for (c=0; c<numCharCategories; c++) {
1407                    System.out.printf(" %2d", rowArray[c]);
1408                }
1409                System.out.printf("\n");
1410            }
1411            System.out.printf("\n\n");
1412        }
1413 
1414 
1415 
1416 
1417 
1418        //-----------------------------------------------------------------------------
1419        //
1420        //   printRuleStatusTable    Debug Function.  Dump the common rule status table
1421        //
1422        //-----------------------------------------------------------------------------
1423 
1424        void printRuleStatusTable() {
1425            int  thisRecord = 0;
1426            int  nextRecord = 0;
1427            int      i;
1428            List<Integer> tbl = fRB.fRuleStatusVals;
1429 
1430            System.out.print("index |  tags \n");
1431            System.out.print("-------------------\n");
1432 
1433            while (nextRecord < tbl.size()) {
1434                thisRecord = nextRecord;
1435                nextRecord = thisRecord + tbl.get(thisRecord).intValue() + 1;
1436                RBBINode.printInt(thisRecord, 7);
1437                for (i=thisRecord+1; i<nextRecord; i++) {
1438                    int val = tbl.get(i).intValue();
1439                    RBBINode.printInt(val, 7);
1440                }
1441                System.out.print("\n");
1442            }
1443            System.out.print("\n\n");
1444        }
1445 
1446 
1447 
1448 }
1449