• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 /*
3 *******************************************************************************
4 * Copyright (C) 2003-2011, International Business Machines Corporation and    *
5 * others. All Rights Reserved.                                                *
6 *******************************************************************************
7 */
8 package android.icu.text;
9 
10 import java.io.IOException;
11 import java.io.OutputStream;
12 import java.util.ArrayList;
13 import java.util.List;
14 
15 import android.icu.impl.Assert;
16 import android.icu.impl.IntTrieBuilder;
17 
18 //
19 //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules
20 //                   (part of the rule building process.)
21 //
22 //      Starting with the rules parse tree from the scanner,
23 //
24 //                   -  Enumerate the set of UnicodeSets that are referenced
25 //                      by the RBBI rules.
26 //                   -  compute a set of non-overlapping character ranges
27 //                      with all characters within a range belonging to the same
28 //                      set of input uniocde sets.
29 //                   -  Derive a set of non-overlapping UnicodeSet (like things)
30 //                      that will correspond to columns in the state table for
31 //                      the RBBI execution engine.  All characters within one
32 //                      of these sets belong to the same set of the original
33 //                      UnicodeSets from the user's rules.
34 //                   -  construct the trie table that maps input characters
35 //                      to the index of the matching non-overlapping set of set from
36 //                      the previous step.
37 //
38 class RBBISetBuilder {
39     static class RangeDescriptor  {
40            int                fStartChar;      // Start of range, unicode 32 bit value.
41            int                fEndChar;        // End of range, unicode 32 bit value.
42            int                fNum;            // runtime-mapped input value for this range.
43            List<RBBINode>     fIncludesSets;    // vector of the the original
44                                                  //   Unicode sets that include this range.
45                                                 //    (Contains ptrs to uset nodes)
46             RangeDescriptor   fNext;           // Next RangeDescriptor in the linked list.
47 
RangeDescriptor()48             RangeDescriptor() {
49                 fIncludesSets = new ArrayList<RBBINode>();
50             }
51 
RangeDescriptor(RangeDescriptor other)52             RangeDescriptor(RangeDescriptor other) {
53                 fStartChar = other.fStartChar;
54                 fEndChar   = other.fEndChar;
55                 fNum       = other.fNum;
56                 fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets);
57             }
58 
59             //-------------------------------------------------------------------------------------
60             //
61             //          RangeDesriptor::split()
62             //
63             //-------------------------------------------------------------------------------------
split(int where)64             void split(int where) {
65                 Assert.assrt(where>fStartChar && where<=fEndChar);
66                 RangeDescriptor nr = new RangeDescriptor(this);
67 
68                 //  RangeDescriptor copy constructor copies all fields.
69                 //  Only need to update those that are different after the split.
70                 nr.fStartChar = where;
71                 this.fEndChar = where-1;
72                 nr.fNext      = this.fNext;
73                 this.fNext    = nr;
74 
75                 // TODO:  fIncludesSets is not updated.  Check it out.
76                 //         Probably because they haven't been populated yet,
77                 //         but still sloppy.
78             }
79 
80 
81             //-------------------------------------------------------------------------------------
82             //
83             //          RangeDescriptor::setDictionaryFlag
84             //
85             //          Character Category Numbers that include characters from
86             //          the original Unicode Set named "dictionary" have bit 14
87             //          set to 1.  The RBBI runtime engine uses this to trigger
88             //          use of the word dictionary.
89             //
90             //          This function looks through the Unicode Sets that it
91             //          (the range) includes, and sets the bit in fNum when
92             //          "dictionary" is among them.
93             //
94             //          TODO:  a faster way would be to find the set node for
95             //          "dictionary" just once, rather than looking it
96             //          up by name every time.
97             //
98             // -------------------------------------------------------------------------------------
setDictionaryFlag()99             void setDictionaryFlag() {
100                 int i;
101 
102                 for (i=0; i<this.fIncludesSets.size(); i++) {
103                     RBBINode        usetNode    = fIncludesSets.get(i);
104                     String          setName = "";
105                     RBBINode        setRef = usetNode.fParent;
106                     if (setRef != null) {
107                         RBBINode varRef = setRef.fParent;
108                         if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
109                             setName = varRef.fText;
110                         }
111                     }
112                     if (setName.equals("dictionary")) {
113                         this.fNum |= 0x4000;
114                         break;
115                     }
116                 }
117 
118         }
119     }
120 
121 
122     RBBIRuleBuilder       fRB;             // The RBBI Rule Compiler that owns us.
123     RangeDescriptor       fRangeList;      // Head of the linked list of RangeDescriptors
124 
125     IntTrieBuilder        fTrie;           // The mapping TRIE that is the end result of processing
126     int                  fTrieSize;        //  the Unicode Sets.
127 
128     // Groups correspond to character categories -
129     //       groups of ranges that are in the same original UnicodeSets.
130     //       fGroupCount is the index of the last used group.
131     //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
132     //       State table column 0 is not used.  Column 1 is for end-of-input.
133     //       column 2 is for group 0.  Funny counting.
134     int                fGroupCount;
135 
136     boolean             fSawBOF;
137 
138 
139     //------------------------------------------------------------------------
140     //
141     //       RBBISetBuilder Constructor
142     //
143     //------------------------------------------------------------------------
RBBISetBuilder(RBBIRuleBuilder rb)144     RBBISetBuilder(RBBIRuleBuilder rb)
145     {
146         fRB             = rb;
147     }
148 
149 
150     //------------------------------------------------------------------------
151     //
152     //           build          Build the list of non-overlapping character ranges
153     //                          from the Unicode Sets.
154     //
155     //------------------------------------------------------------------------
build()156     void build() {
157         RangeDescriptor rlRange;
158 
159         if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();}
160 
161         //  Initialize the process by creating a single range encompassing all characters
162         //  that is in no sets.
163         //
164         fRangeList               = new RangeDescriptor();
165         fRangeList.fStartChar    = 0;
166         fRangeList.fEndChar      = 0x10ffff;
167 
168         //
169         //  Find the set of non-overlapping ranges of characters
170         //
171         for (RBBINode usetNode : fRB.fUSetNodes) {
172             UnicodeSet      inputSet             = usetNode.fInputSet;
173             int            inputSetRangeCount   = inputSet.getRangeCount();
174             int            inputSetRangeIndex   = 0;
175             rlRange              = fRangeList;
176 
177             for (;;) {
178                 if (inputSetRangeIndex >= inputSetRangeCount) {
179                     break;
180                 }
181                 int      inputSetRangeBegin  = inputSet.getRangeStart(inputSetRangeIndex);
182                 int      inputSetRangeEnd    = inputSet.getRangeEnd(inputSetRangeIndex);
183 
184                 // skip over ranges from the range list that are completely
185                 //   below the current range from the input unicode set.
186                 while (rlRange.fEndChar < inputSetRangeBegin) {
187                     rlRange = rlRange.fNext;
188                 }
189 
190                 // If the start of the range from the range list is before with
191                 //   the start of the range from the unicode set, split the range list range
192                 //   in two, with one part being before (wholly outside of) the unicode set
193                 //   and the other containing the rest.
194                 //   Then continue the loop; the post-split current range will then be skipped
195                 //     over
196                 if (rlRange.fStartChar < inputSetRangeBegin) {
197                     rlRange.split(inputSetRangeBegin);
198                      continue;
199                 }
200 
201                 // Same thing at the end of the ranges...
202                 // If the end of the range from the range list doesn't coincide with
203                 //   the end of the range from the unicode set, split the range list
204                 //   range in two.  The first part of the split range will be
205                 //   wholly inside the Unicode set.
206                 if (rlRange.fEndChar > inputSetRangeEnd) {
207                     rlRange.split(inputSetRangeEnd+1);
208                  }
209 
210                 // The current rlRange is now entirely within the UnicodeSet range.
211                 // Add this unicode set to the list of sets for this rlRange
212                 if (rlRange.fIncludesSets.indexOf(usetNode) == -1) {
213                     rlRange.fIncludesSets.add(usetNode);
214                 }
215 
216                 // Advance over ranges that we are finished with.
217                 if (inputSetRangeEnd == rlRange.fEndChar) {
218                     inputSetRangeIndex++;
219                 }
220                 rlRange = rlRange.fNext;
221             }
222         }
223 
224         if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("range")>=0) { printRanges();}
225 
226         //
227         //  Group the above ranges, with each group consisting of one or more
228         //    ranges that are in exactly the same set of original UnicodeSets.
229         //    The groups are numbered, and these group numbers are the set of
230         //    input symbols recognized by the run-time state machine.
231         //
232         //    Numbering: # 0  (state table column 0) is unused.
233         //               # 1  is reserved - table column 1 is for end-of-input
234         //               # 2  is reserved - table column 2 is for beginning-in-input
235         //               # 3  is the first range list.
236         //
237         RangeDescriptor rlSearchRange;
238         for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
239             for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) {
240                 if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) {
241                     rlRange.fNum = rlSearchRange.fNum;
242                     break;
243                 }
244             }
245             if (rlRange.fNum == 0) {
246                 fGroupCount ++;
247                 rlRange.fNum = fGroupCount+2;
248                 rlRange.setDictionaryFlag();
249                 addValToSets(rlRange.fIncludesSets, fGroupCount+2);
250             }
251         }
252 
253         // Handle input sets that contain the special string {eof}.
254         //   Column 1 of the state table is reserved for EOF on input.
255         //   Column 2 is reserved for before-the-start-input.
256         //            (This column can be optimized away later if there are no rule
257         //             references to {bof}.)
258         //   Add this column value (1 or 2) to the equivalent expression
259         //     subtree for each UnicodeSet that contains the string {eof}
260         //   Because {bof} and {eof} are not a characters in the normal sense,
261         //   they doesn't affect the computation of ranges or TRIE.
262 
263         String eofString = "eof";
264         String bofString = "bof";
265 
266         for (RBBINode usetNode : fRB.fUSetNodes) {
267             UnicodeSet      inputSet = usetNode.fInputSet;
268             if (inputSet.contains(eofString)) {
269                 addValToSet(usetNode, 1);
270             }
271             if (inputSet.contains(bofString)) {
272                 addValToSet(usetNode, 2);
273                 fSawBOF = true;
274             }
275         }
276 
277 
278         if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();}
279         if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();}
280 
281 
282         //IntTrieBuilder(int aliasdata[], int maxdatalength,
283         //        int initialvalue, int leadunitvalue,
284         //        boolean latin1linear)
285 
286         fTrie = new IntTrieBuilder(null,   //   Data array  (utrie will allocate one)
287                                    100000,  //   Max Data Length
288                                    0,       //   Initial value for all code points
289                                    0,       //   Lead Surrogate unit value,
290                                    true);   //   Keep Latin 1 in separately.
291 
292         for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
293             fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar+1, rlRange.fNum, true);
294         }
295     }
296 
297 
298 
299     //-----------------------------------------------------------------------------------
300     //
301     //   RBBIDataManipulate  A little internal class needed only to wrap of the
302     //                       getFoldedValue() function needed for Trie table creation.
303     //
304     //-----------------------------------------------------------------------------------
305    class RBBIDataManipulate implements IntTrieBuilder.DataManipulate {
getFoldedValue(int start, int offset)306         public int getFoldedValue(int start, int offset) {
307             int  value;
308             int  limit;
309             boolean [] inBlockZero = new boolean[1];
310 
311             limit = start + 0x400;
312             while(start<limit) {
313                 value = fTrie.getValue(start, inBlockZero);
314                 if (inBlockZero[0]) {
315                     start += IntTrieBuilder.DATA_BLOCK_LENGTH;
316                 } else if (value != 0) {
317                     return offset | 0x08000;
318                 } else {
319                     ++start;
320                 }
321             }
322             return 0;
323          }
324     }
325     RBBIDataManipulate dm = new RBBIDataManipulate();
326 
327     //-----------------------------------------------------------------------------------
328     //
329     //          getTrieSize()    Return the size that will be required to serialize the Trie.
330     //
331     //-----------------------------------------------------------------------------------
getTrieSize()332     int getTrieSize()  {
333         int size = 0;
334         try {
335             // The trie serialize function returns the size of the data written.
336             //    null output stream says give size only, don't actually write anything.
337             size = fTrie.serialize(null, true, dm );
338         } catch (IOException e) {
339             Assert.assrt (false);
340         }
341         return size;
342     }
343 
344 
345     //-----------------------------------------------------------------------------------
346     //
347     //          serializeTrie()   Write the serialized trie to an output stream
348     //
349     //-----------------------------------------------------------------------------------
serializeTrie(OutputStream os)350     void serializeTrie(OutputStream os) throws IOException {
351         fTrie.serialize(os, true, dm );
352    }
353 
354     //------------------------------------------------------------------------
355     //
356     //      addValToSets     Add a runtime-mapped input value to each uset from a
357     //      list of uset nodes. (val corresponds to a state table column.)
358     //      For each of the original Unicode sets - which correspond
359     //      directly to uset nodes - a logically equivalent expression
360     //      is constructed in terms of the remapped runtime input
361     //      symbol set.  This function adds one runtime input symbol to
362     //      a list of sets.
363     //
364     //      The "logically equivalent expression" is the tree for an
365     //      or-ing together of all of the symbols that go into the set.
366     //
367     //------------------------------------------------------------------------
addValToSets(List<RBBINode> sets, int val)368     void  addValToSets(List<RBBINode> sets, int val) {
369         for (RBBINode usetNode : sets) {
370             addValToSet(usetNode, val);
371         }
372     }
373 
addValToSet(RBBINode usetNode, int val)374     void  addValToSet(RBBINode usetNode, int val) {
375         RBBINode leafNode = new RBBINode(RBBINode.leafChar);
376         leafNode.fVal = val;
377         if (usetNode.fLeftChild == null) {
378             usetNode.fLeftChild = leafNode;
379             leafNode.fParent    = usetNode;
380         } else {
381             // There are already input symbols present for this set.
382             // Set up an OR node, with the previous stuff as the left child
383             //   and the new value as the right child.
384             RBBINode orNode = new RBBINode(RBBINode.opOr);
385             orNode.fLeftChild  = usetNode.fLeftChild;
386             orNode.fRightChild = leafNode;
387             orNode.fLeftChild.fParent  = orNode;
388             orNode.fRightChild.fParent = orNode;
389             usetNode.fLeftChild = orNode;
390             orNode.fParent = usetNode;
391         }
392     }
393 
394 
395     //------------------------------------------------------------------------
396     //
397     //           getNumCharCategories
398     //
399     //------------------------------------------------------------------------
getNumCharCategories()400     int  getNumCharCategories()  {
401         return fGroupCount + 3;
402     }
403 
404 
405     //------------------------------------------------------------------------
406     //
407     //           sawBOF
408     //
409     //------------------------------------------------------------------------
sawBOF()410     boolean  sawBOF()  {
411         return fSawBOF;
412     }
413 
414 
415     //------------------------------------------------------------------------
416     //
417     //           getFirstChar      Given a runtime RBBI character category, find
418     //                             the first UChar32 that is in the set of chars
419     //                             in the category.
420     //------------------------------------------------------------------------
getFirstChar(int category)421     int  getFirstChar(int category)  {
422         RangeDescriptor   rlRange;
423         int            retVal = -1;
424         for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
425             if (rlRange.fNum == category) {
426                 retVal = rlRange.fStartChar;
427                 break;
428             }
429         }
430         return retVal;
431     }
432 
433 
434 
435     //------------------------------------------------------------------------
436     //
437     //           printRanges        A debugging function.
438     //                              dump out all of the range definitions.
439     //
440     //------------------------------------------------------------------------
441     ///CLOVER:OFF
printRanges()442     void printRanges() {
443         RangeDescriptor       rlRange;
444         int                    i;
445 
446         System.out.print("\n\n Nonoverlapping Ranges ...\n");
447         for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
448             System.out.print(" " + rlRange.fNum + "   " + rlRange.fStartChar + "-" + rlRange.fEndChar);
449 
450             for (i=0; i<rlRange.fIncludesSets.size(); i++) {
451                 RBBINode       usetNode    = rlRange.fIncludesSets.get(i);
452                 String         setName = "anon";
453                 RBBINode       setRef = usetNode.fParent;
454                 if (setRef != null) {
455                     RBBINode varRef = setRef.fParent;
456                     if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
457                         setName = varRef.fText;
458                     }
459                 }
460                 System.out.print(setName); System.out.print("  ");
461             }
462             System.out.println("");
463         }
464     }
465     ///CLOVER:ON
466 
467 
468     //------------------------------------------------------------------------
469     //
470     //           printRangeGroups     A debugging function.
471     //                                dump out all of the range groups.
472     //
473     //------------------------------------------------------------------------
474     ///CLOVER:OFF
printRangeGroups()475     void printRangeGroups() {
476         RangeDescriptor       rlRange;
477         RangeDescriptor       tRange;
478         int                    i;
479         int                    lastPrintedGroupNum = 0;
480 
481         System.out.print("\nRanges grouped by Unicode Set Membership...\n");
482         for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
483             int groupNum = rlRange.fNum & 0xbfff;
484             if (groupNum > lastPrintedGroupNum) {
485                 lastPrintedGroupNum = groupNum;
486                 if (groupNum<10) {System.out.print(" ");}
487                 System.out.print(groupNum + " ");
488 
489                 if ((rlRange.fNum & 0x4000) != 0) { System.out.print(" <DICT> ");}
490 
491                 for (i=0; i<rlRange.fIncludesSets.size(); i++) {
492                     RBBINode       usetNode    = rlRange.fIncludesSets.get(i);
493                     String         setName = "anon";
494                     RBBINode       setRef = usetNode.fParent;
495                     if (setRef != null) {
496                         RBBINode varRef = setRef.fParent;
497                         if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
498                             setName = varRef.fText;
499                         }
500                     }
501                     System.out.print(setName); System.out.print(" ");
502                 }
503 
504                 i = 0;
505                 for (tRange = rlRange; tRange != null; tRange = tRange.fNext) {
506                     if (tRange.fNum == rlRange.fNum) {
507                         if (i++ % 5 == 0) {
508                             System.out.print("\n    ");
509                         }
510                         RBBINode.printHex(tRange.fStartChar, -1);
511                         System.out.print("-");
512                         RBBINode.printHex(tRange.fEndChar, 0);
513                     }
514                 }
515                 System.out.print("\n");
516             }
517         }
518         System.out.print("\n");
519     }
520     ///CLOVER:ON
521 
522 
523     //------------------------------------------------------------------------
524     //
525     //           printSets          A debugging function.
526     //                              dump out all of the set definitions.
527     //
528     //------------------------------------------------------------------------
529     ///CLOVER:OFF
printSets()530     void printSets() {
531         int                   i;
532         System.out.print("\n\nUnicode Sets List\n------------------\n");
533         for (i=0; i<fRB.fUSetNodes.size(); i++) {
534             RBBINode        usetNode;
535             RBBINode        setRef;
536             RBBINode        varRef;
537             String          setName;
538 
539             usetNode = fRB.fUSetNodes.get(i);
540 
541             //System.out.print(" " + i + "   ");
542             RBBINode.printInt(2, i);
543             setName = "anonymous";
544             setRef = usetNode.fParent;
545             if (setRef != null) {
546                 varRef = setRef.fParent;
547                 if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
548                     setName = varRef.fText;
549                 }
550             }
551             System.out.print("  " + setName);
552             System.out.print("   ");
553             System.out.print(usetNode.fText);
554             System.out.print("\n");
555             if (usetNode.fLeftChild != null) {
556                 usetNode.fLeftChild.printTree(true);
557             }
558         }
559         System.out.print("\n");
560     }
561     ///CLOVER:ON
562 }
563