• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2003-2011, International Business Machines Corporation and    *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9 package com.ibm.icu.text;
10 
11 import java.io.ByteArrayOutputStream;
12 import java.io.IOException;
13 import java.io.OutputStream;
14 import java.util.ArrayList;
15 import java.util.List;
16 
17 import com.ibm.icu.impl.Assert;
18 import com.ibm.icu.text.RBBIRuleBuilder.IntPair;
19 import com.ibm.icu.util.CodePointTrie;
20 import com.ibm.icu.util.MutableCodePointTrie;
21 
22 //
23 //  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules
24 //                   (part of the rule building process.)
25 //
26 //      Starting with the rules parse tree from the scanner,
27 //
28 //                   -  Enumerate the set of UnicodeSets that are referenced
29 //                      by the RBBI rules.
30 //                   -  compute a set of non-overlapping character ranges
31 //                      with all characters within a range belonging to the same
32 //                      set of input unicode sets.
33 //                   -  Derive a set of non-overlapping UnicodeSet (like things)
34 //                      that will correspond to columns in the state table for
35 //                      the RBBI execution engine.  All characters within one
36 //                      of these sets belong to the same set of the original
37 //                      UnicodeSets from the user's rules.
38 //                   -  construct the trie table that maps input characters
39 //                      to the index of the matching non-overlapping set of set from
40 //                      the previous step.
41 //
42 class RBBISetBuilder {
43     static class RangeDescriptor  {
44            int                fStartChar = 0;         // Start of range, unicode 32 bit value.
45            int                fEndChar = 0;           // End of range, unicode 32 bit value.
46            int                fNum = 0;               // runtime-mapped input value for this range.
47            boolean            fIncludesDict = false;  // True if the range includes $dictionary.
48            boolean            fFirstInGroup = false;  // True if first range in a group with the same fNum.
49            List<RBBINode>     fIncludesSets;          // vector of the the original
50                                                       //   Unicode sets that include this range.
51                                                       //    (Contains ptrs to uset nodes)
52             RangeDescriptor   fNext;                  // Next RangeDescriptor in the linked list.
53 
RangeDescriptor()54             RangeDescriptor() {
55                 fIncludesSets = new ArrayList<>();
56             }
57 
RangeDescriptor(RangeDescriptor other)58             RangeDescriptor(RangeDescriptor other) {
59                 fStartChar = other.fStartChar;
60                 fEndChar   = other.fEndChar;
61                 fNum       = other.fNum;
62                 fIncludesDict = other.fIncludesDict;
63                 fFirstInGroup = other.fFirstInGroup;
64                 fIncludesSets = new ArrayList<>(other.fIncludesSets);
65             }
66 
67             //-------------------------------------------------------------------------------------
68             //
69             //          RangeDesriptor::split()
70             //
71             //-------------------------------------------------------------------------------------
split(int where)72             void split(int where) {
73                 Assert.assrt(where>fStartChar && where<=fEndChar);
74                 RangeDescriptor nr = new RangeDescriptor(this);
75 
76                 //  RangeDescriptor copy constructor copies all fields.
77                 //  Only need to update those that are different after the split.
78                 nr.fStartChar = where;
79                 this.fEndChar = where-1;
80                 nr.fNext      = this.fNext;
81                 this.fNext    = nr;
82 
83                 // TODO:  fIncludesSets is not updated.  Check it out.
84                 //         Probably because they haven't been populated yet,
85                 //         but still sloppy.
86             }
87 
88 
89             /**
90              * Test whether this range includes characters from the original Unicode Set named "dictionary".
91              *
92              * This function looks through the Unicode Sets that
93              * the range includes, checking for one named "dictionary"
94              */
95             //          TODO:  a faster way would be to find the set node for
96             //          "dictionary" just once, rather than looking it
97             //          up by name every time.
98             //
isDictionaryRange()99             boolean isDictionaryRange() {
100                 for (int i=0; i<this.fIncludesSets.size(); i++) {
101                     RBBINode        usetNode    = fIncludesSets.get(i);
102                     String          setName = "";
103                     RBBINode        setRef = usetNode.fParent;
104                     if (setRef != null) {
105                         RBBINode varRef = setRef.fParent;
106                         if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
107                             setName = varRef.fText;
108                         }
109                     }
110                     if (setName.equals("dictionary")) {
111                         return true;
112                     }
113                 }
114                 return false;
115         }
116     }
117 
118 
119     RBBIRuleBuilder       fRB;             // The RBBI Rule Compiler that owns us.
120     RangeDescriptor       fRangeList;      // Head of the linked list of RangeDescriptors
121 
122     MutableCodePointTrie  fTrie;           // The mapping TRIE that is the end result of processing
123                                            //  the Unicode Sets.
124     CodePointTrie         fFrozenTrie;
125 
126     /**
127      * Number of range groups, which are groups of ranges that are in the same original UnicodeSets.
128      */
129     int                fGroupCount;
130     /**
131      * The number of the first dictionary char category.
132      * If there are no Dictionary categories, set to the last category + 1.
133      */
134     int                fDictCategoriesStart;
135 
136     boolean             fSawBOF;
137 
138 
139     //------------------------------------------------------------------------
140     //
141     //       RBBISetBuilder Constructor
142     //
143     //------------------------------------------------------------------------
RBBISetBuilder(RBBIRuleBuilder rb)144     RBBISetBuilder(RBBIRuleBuilder rb)
145     {
146         fRB             = rb;
147     }
148 
149 
150     //------------------------------------------------------------------------
151     //
152     //           build          Build the list of non-overlapping character ranges
153     //                          from the Unicode Sets.
154     //
155     //------------------------------------------------------------------------
buildRanges()156     void buildRanges() {
157         RangeDescriptor rlRange;
158 
159         if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();}
160 
161         //  Initialize the process by creating a single range encompassing all characters
162         //  that is in no sets.
163         //
164         fRangeList               = new RangeDescriptor();
165         fRangeList.fStartChar    = 0;
166         fRangeList.fEndChar      = 0x10ffff;
167 
168         //
169         //  Find the set of non-overlapping ranges of characters
170         //
171         for (RBBINode usetNode : fRB.fUSetNodes) {
172             UnicodeSet      inputSet             = usetNode.fInputSet;
173             int            inputSetRangeCount   = inputSet.getRangeCount();
174             int            inputSetRangeIndex   = 0;
175             rlRange              = fRangeList;
176 
177             for (;;) {
178                 if (inputSetRangeIndex >= inputSetRangeCount) {
179                     break;
180                 }
181                 int      inputSetRangeBegin  = inputSet.getRangeStart(inputSetRangeIndex);
182                 int      inputSetRangeEnd    = inputSet.getRangeEnd(inputSetRangeIndex);
183 
184                 // skip over ranges from the range list that are completely
185                 //   below the current range from the input unicode set.
186                 while (rlRange.fEndChar < inputSetRangeBegin) {
187                     rlRange = rlRange.fNext;
188                 }
189 
190                 // If the start of the range from the range list is before with
191                 //   the start of the range from the unicode set, split the range list range
192                 //   in two, with one part being before (wholly outside of) the unicode set
193                 //   and the other containing the rest.
194                 //   Then continue the loop; the post-split current range will then be skipped
195                 //     over
196                 if (rlRange.fStartChar < inputSetRangeBegin) {
197                     rlRange.split(inputSetRangeBegin);
198                      continue;
199                 }
200 
201                 // Same thing at the end of the ranges...
202                 // If the end of the range from the range list doesn't coincide with
203                 //   the end of the range from the unicode set, split the range list
204                 //   range in two.  The first part of the split range will be
205                 //   wholly inside the Unicode set.
206                 if (rlRange.fEndChar > inputSetRangeEnd) {
207                     rlRange.split(inputSetRangeEnd+1);
208                  }
209 
210                 // The current rlRange is now entirely within the UnicodeSet range.
211                 // Add this unicode set to the list of sets for this rlRange
212                 if (rlRange.fIncludesSets.indexOf(usetNode) == -1) {
213                     rlRange.fIncludesSets.add(usetNode);
214                 }
215 
216                 // Advance over ranges that we are finished with.
217                 if (inputSetRangeEnd == rlRange.fEndChar) {
218                     inputSetRangeIndex++;
219                 }
220                 rlRange = rlRange.fNext;
221             }
222         }
223 
224         if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("range")>=0) { printRanges();}
225 
226         //
227         //  Group the above ranges, with each group consisting of one or more
228         //    ranges that are in exactly the same set of original UnicodeSets.
229         //    The groups are numbered, and these group numbers are the set of
230         //    input symbols recognized by the run-time state machine.
231         //
232         //    Numbering: # 0  (state table column 0) is unused.
233         //               # 1  is reserved - table column 1 is for end-of-input
234         //               # 2  is reserved - table column 2 is for beginning-of-input
235         //               # 3  is the first range list.
236         //
237         RangeDescriptor rlSearchRange;
238         int dictGroupCount = 0;
239 
240         for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
241             for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) {
242                 if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) {
243                     rlRange.fNum = rlSearchRange.fNum;
244                     rlRange.fIncludesDict = rlSearchRange.fIncludesDict;
245                     break;
246                 }
247             }
248             if (rlRange.fNum == 0) {
249                 rlRange.fFirstInGroup = true;
250                 if (rlRange.isDictionaryRange()) {
251                     rlRange.fNum = ++dictGroupCount;
252                     rlRange.fIncludesDict = true;
253                 } else {
254                     fGroupCount++;
255                     rlRange.fNum = fGroupCount + 2;
256                     addValToSets(rlRange.fIncludesSets, fGroupCount + 2);
257                 }
258             }
259         }
260 
261         // Move the character category numbers for any dictionary ranges up, so that they
262         // immediately follow the non-dictionary ranges.
263 
264         fDictCategoriesStart = fGroupCount + 3;
265         for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
266             if (rlRange.fIncludesDict) {
267                 rlRange.fNum += fDictCategoriesStart - 1;
268                 if (rlRange.fFirstInGroup) {
269                     addValToSets(rlRange.fIncludesSets, rlRange.fNum);
270                 }
271             }
272         }
273         fGroupCount += dictGroupCount;
274 
275 
276 
277         // Handle input sets that contain the special string {eof}.
278         //   Column 1 of the state table is reserved for EOF on input.
279         //   Column 2 is reserved for before-the-start-input.
280         //            (This column can be optimized away later if there are no rule
281         //             references to {bof}.)
282         //   Add this column value (1 or 2) to the equivalent expression
283         //     subtree for each UnicodeSet that contains the string {eof}
284         //   Because {bof} and {eof} are not a characters in the normal sense,
285         //   they doesn't affect the computation of ranges or TRIE.
286 
287         String eofString = "eof";
288         String bofString = "bof";
289 
290         for (RBBINode usetNode : fRB.fUSetNodes) {
291             UnicodeSet      inputSet = usetNode.fInputSet;
292             if (inputSet.contains(eofString)) {
293                 addValToSet(usetNode, 1);
294             }
295             if (inputSet.contains(bofString)) {
296                 addValToSet(usetNode, 2);
297                 fSawBOF = true;
298             }
299         }
300 
301 
302         if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();}
303         if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();}
304     }
305 
306 
307     private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 255;
308 
309     /**
310      * Build the Trie table for mapping UChar32 values to the corresponding
311      * range group number.
312      */
buildTrie()313     void buildTrie() {
314         fTrie = new MutableCodePointTrie(0,       //   Initial value for all code points.
315                                          0);      //   Error value for out-of-range input.
316 
317         for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
318             fTrie.setRange(rlRange.fStartChar,     // Range start
319                            rlRange.fEndChar,       // Range end (inclusive)
320                            rlRange.fNum            // value for range
321                           );
322         }
323     }
324 
325     /**
326      * Merge two character categories that have been identified as having equivalent behavior.
327      * The ranges belonging to the second category (table column) will be added to the first.
328      * @param categories the pair of categories to be merged.
329      */
mergeCategories(IntPair categories)330     void mergeCategories(IntPair categories) {
331         assert(categories.first >= 1);
332         assert(categories.second > categories.first);
333         assert((categories.first <  fDictCategoriesStart && categories.second <  fDictCategoriesStart) ||
334                 (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart));
335         for (RangeDescriptor rd = fRangeList; rd != null; rd = rd.fNext) {
336             int rangeNum = rd.fNum;
337             if (rangeNum == categories.second) {
338                 rd.fNum = categories.first;
339             } else if (rangeNum > categories.second) {
340                 rd.fNum--;
341             }
342         }
343         --fGroupCount;
344         if (categories.second <= fDictCategoriesStart) {
345             --fDictCategoriesStart;
346         }
347     }
348 
349     //-----------------------------------------------------------------------------------
350     //
351     //          freezeTrieIfNotYet()    Ensure the trie is frozen. Shared code by getTrieSize
352     //                                  and serializeTrie.
353     //
354     //-----------------------------------------------------------------------------------
freezeTrieIfNotYet()355     void freezeTrieIfNotYet()  {
356         if (fFrozenTrie == null) {
357             boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE;
358             fFrozenTrie = fTrie.buildImmutable(CodePointTrie.Type.FAST,
359                                                use8Bits ?
360                                                CodePointTrie.ValueWidth.BITS_8 :
361                                                CodePointTrie.ValueWidth.BITS_16);
362             fTrie = null;
363         }
364     }
365 
366     //-----------------------------------------------------------------------------------
367     //
368     //          getTrieSize()    Return the size that will be required to serialize the Trie.
369     //
370     //-----------------------------------------------------------------------------------
getTrieSize()371     int getTrieSize()  {
372         freezeTrieIfNotYet();
373         return fFrozenTrie.toBinary(new ByteArrayOutputStream());
374     }
375 
376 
377     //-----------------------------------------------------------------------------------
378     //
379     //          serializeTrie()   Write the serialized trie to an output stream
380     //
381     //-----------------------------------------------------------------------------------
serializeTrie(OutputStream os)382     void serializeTrie(OutputStream os) throws IOException {
383         freezeTrieIfNotYet();
384         fFrozenTrie.toBinary(os);
385    }
386 
387     //------------------------------------------------------------------------
388     //
389     //      addValToSets     Add a runtime-mapped input value to each uset from a
390     //      list of uset nodes. (val corresponds to a state table column.)
391     //      For each of the original Unicode sets - which correspond
392     //      directly to uset nodes - a logically equivalent expression
393     //      is constructed in terms of the remapped runtime input
394     //      symbol set.  This function adds one runtime input symbol to
395     //      a list of sets.
396     //
397     //      The "logically equivalent expression" is the tree for an
398     //      or-ing together of all of the symbols that go into the set.
399     //
400     //------------------------------------------------------------------------
addValToSets(List<RBBINode> sets, int val)401     void  addValToSets(List<RBBINode> sets, int val) {
402         for (RBBINode usetNode : sets) {
403             addValToSet(usetNode, val);
404         }
405     }
406 
addValToSet(RBBINode usetNode, int val)407     void  addValToSet(RBBINode usetNode, int val) {
408         RBBINode leafNode = new RBBINode(RBBINode.leafChar);
409         leafNode.fVal = val;
410         if (usetNode.fLeftChild == null) {
411             usetNode.fLeftChild = leafNode;
412             leafNode.fParent    = usetNode;
413         } else {
414             // There are already input symbols present for this set.
415             // Set up an OR node, with the previous stuff as the left child
416             //   and the new value as the right child.
417             RBBINode orNode = new RBBINode(RBBINode.opOr);
418             orNode.fLeftChild  = usetNode.fLeftChild;
419             orNode.fRightChild = leafNode;
420             orNode.fLeftChild.fParent  = orNode;
421             orNode.fRightChild.fParent = orNode;
422             usetNode.fLeftChild = orNode;
423             orNode.fParent = usetNode;
424         }
425     }
426 
427 
428     //------------------------------------------------------------------------
429     //
430     //           getNumCharCategories
431     //
432     //------------------------------------------------------------------------
getNumCharCategories()433     int  getNumCharCategories()  {
434         return fGroupCount + 3;
435     }
436 
437 
438     //------------------------------------------------------------------------
439     //
440     //   getDictCategoriesStart
441     //
442     //------------------------------------------------------------------------
getDictCategoriesStart()443     int  getDictCategoriesStart() {
444         return fDictCategoriesStart;
445     }
446 
447 
448     //------------------------------------------------------------------------
449     //
450     //           sawBOF
451     //
452     //------------------------------------------------------------------------
sawBOF()453     boolean  sawBOF()  {
454         return fSawBOF;
455     }
456 
457 
458     //------------------------------------------------------------------------
459     //
460     //           getFirstChar      Given a runtime RBBI character category, find
461     //                             the first UChar32 that is in the set of chars
462     //                             in the category.
463     //------------------------------------------------------------------------
getFirstChar(int category)464     int  getFirstChar(int category)  {
465         RangeDescriptor   rlRange;
466         int            retVal = -1;
467         for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
468             if (rlRange.fNum == category) {
469                 retVal = rlRange.fStartChar;
470                 break;
471             }
472         }
473         return retVal;
474     }
475 
476 
477     //------------------------------------------------------------------------
478     //
479     //           printRanges        A debugging function.
480     //                              dump out all of the range definitions.
481     //
482     //------------------------------------------------------------------------
483     ///CLOVER:OFF
printRanges()484     void printRanges() {
485         RangeDescriptor       rlRange;
486         int                    i;
487 
488         System.out.print("\n\n Nonoverlapping Ranges ...\n");
489         for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
490             System.out.printf("%04x-%04x ", rlRange.fStartChar, rlRange.fEndChar);
491 
492             for (i=0; i<rlRange.fIncludesSets.size(); i++) {
493                 RBBINode       usetNode    = rlRange.fIncludesSets.get(i);
494                 String         setName = "anon";
495                 RBBINode       setRef = usetNode.fParent;
496                 if (setRef != null) {
497                     RBBINode varRef = setRef.fParent;
498                     if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
499                         setName = varRef.fText;
500                     }
501                 }
502                 System.out.print(setName); System.out.print("  ");
503             }
504             System.out.println("");
505         }
506     }
507     ///CLOVER:ON
508 
509 
510     //------------------------------------------------------------------------
511     //
512     //           printRangeGroups     A debugging function.
513     //                                dump out all of the range groups.
514     //
515     //------------------------------------------------------------------------
516     ///CLOVER:OFF
printRangeGroups()517     void printRangeGroups() {
518         int                    i;
519 
520         System.out.print("\nRanges grouped by Unicode Set Membership...\n");
521         for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
522             if (rlRange.fFirstInGroup) {
523                 int groupNum = rlRange.fNum;
524                 if (groupNum<10) {System.out.print(" ");}
525                 System.out.print(groupNum + " ");
526 
527                 if (groupNum >= fDictCategoriesStart) { System.out.print(" <DICT> ");}
528 
529                 for (i=0; i<rlRange.fIncludesSets.size(); i++) {
530                     RBBINode       usetNode    = rlRange.fIncludesSets.get(i);
531                     String         setName = "anon";
532                     RBBINode       setRef = usetNode.fParent;
533                     if (setRef != null) {
534                         RBBINode varRef = setRef.fParent;
535                         if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
536                             setName = varRef.fText;
537                         }
538                     }
539                     System.out.print(setName); System.out.print(" ");
540                 }
541 
542                 i = 0;
543                 for (RangeDescriptor tRange = rlRange; tRange != null; tRange = tRange.fNext) {
544                     if (tRange.fNum == rlRange.fNum) {
545                         if (i++ % 5 == 0) {
546                             System.out.print("\n    ");
547                         }
548                         RBBINode.printHex(tRange.fStartChar, -1);
549                         System.out.print("-");
550                         RBBINode.printHex(tRange.fEndChar, 0);
551                     }
552                 }
553                 System.out.print("\n");
554             }
555         }
556         System.out.print("\n");
557     }
558     ///CLOVER:ON
559 
560 
561     //------------------------------------------------------------------------
562     //
563     //           printSets          A debugging function.
564     //                              dump out all of the set definitions.
565     //
566     //------------------------------------------------------------------------
567     ///CLOVER:OFF
printSets()568     void printSets() {
569         int                   i;
570         System.out.print("\n\nUnicode Sets List\n------------------\n");
571         for (i=0; i<fRB.fUSetNodes.size(); i++) {
572             RBBINode        usetNode;
573             RBBINode        setRef;
574             RBBINode        varRef;
575             String          setName;
576 
577             usetNode = fRB.fUSetNodes.get(i);
578 
579             //System.out.print(" " + i + "   ");
580             RBBINode.printInt(2, i);
581             setName = "anonymous";
582             setRef = usetNode.fParent;
583             if (setRef != null) {
584                 varRef = setRef.fParent;
585                 if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
586                     setName = varRef.fText;
587                 }
588             }
589             System.out.print("  " + setName);
590             System.out.print("   ");
591             System.out.print(usetNode.fText);
592             System.out.print("\n");
593             if (usetNode.fLeftChild != null) {
594                 usetNode.fLeftChild.printTree(true);
595             }
596         }
597         System.out.print("\n");
598     }
599     ///CLOVER:ON
600 }
601