• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 //
5 //    Copyright (C) 2002-2014, International Business Machines Corporation and others.
6 //    All Rights Reserved.
7 //
8 //
9 
10 package ohos.global.icu.text;
11 
12 import java.io.DataOutputStream;
13 import java.io.IOException;
14 import java.io.OutputStream;
15 import java.util.ArrayList;
16 import java.util.HashMap;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.Set;
20 
21 import ohos.global.icu.impl.Assert;
22 import ohos.global.icu.impl.ICUBinary;
23 import ohos.global.icu.impl.ICUDebug;
24 import ohos.global.icu.impl.RBBIDataWrapper;
25 
26 class RBBIRuleBuilder {
27     //   This is the main class for building (compiling) break rules into the tables
28     //    required by the runtime RBBI engine.
29     //
30 
31     String fDebugEnv;              // controls debug trace output
32     String fRules;                 // The rule string that we are compiling
33     StringBuilder fStrippedRules;  // The rule string, with comments stripped.
34     RBBIRuleScanner fScanner;      // The scanner.
35 
36 
37     //
38     //  There are four separate parse trees generated, one for each of the
39     //    forward rules, reverse rules, safe forward rules and safe reverse rules.
40     //    This array references the root of each of the trees.
41     //    Only fForwardTree data is actually used to generate a state table.
42     //    The other three are retained for back compatibility with old rule files,
43     //    which may have safe and reverse rules. These are still parsed.
44     //
45     RBBINode[]         fTreeRoots = new RBBINode[4];
46     static final int   fForwardTree = 0;  // Indexes into the above fTreeRoots array
47     static final int   fReverseTree = 1;  //   for each of the trees.
48     static final int   fSafeFwdTree = 2;  //   (in C, these are pointer variables and
49     static final int   fSafeRevTree = 3;  //    there is no array.)
50     int fDefaultTree = fForwardTree;      // For rules not qualified with a !
51                                           //   the tree to which they belong.
52 
53     boolean fChainRules;                  // True for chained Unicode TR style rules.
54                                           // False for traditional regexp rules.
55 
56     boolean fLBCMNoChain;                 // True:  suppress chaining of rules on
57                                           //   chars with LineBreak property == CM.
58 
59     boolean fLookAheadHardBreak;          // True:  Look ahead matches cause an
60                                           // immediate break, no continuing for the
61                                           // longest match.
62 
63     RBBISetBuilder fSetBuilder;           // Set and Character Category builder.
64     List<RBBINode> fUSetNodes;            // Vector of all used nodes.
65     RBBITableBuilder fForwardTable;       // State transition tables
66 
67     //
68     // Status {tag} values.   These structures are common to all of the rule sets (Forward, Reverse, etc.).
69     //
70     Map<Set<Integer>, Integer> fStatusSets = new HashMap<Set<Integer>, Integer>(); // Status value sets encountered so far.
71                                                                                    //  Map Key is the set of values.
72                                                                                    //  Map Value is the runtime array index.
73 
74     List<Integer> fRuleStatusVals;        // List of Integer objects.  Has same layout as the
75                                           //   runtime array of status (tag) values -
76                                           //     number of values in group 1
77                                           //        first status value in group 1
78                                           //        2nd status value in group 1
79                                           //        ...
80                                           //     number of values in group 2
81                                           //        first status value in group 2
82                                           //        etc.
83                                           //
84     // Error codes from ICU4C.
85     //    using these simplified the porting, and consolidated the
86     //    creation of Java exceptions
87     //
88     static final int U_BRK_ERROR_START = 0x10200;
89     /**< Start of codes indicating Break Iterator failures */
90 
91     static final int U_BRK_INTERNAL_ERROR = 0x10201;
92     /**< An internal error (bug) was detected.             */
93 
94     static final int U_BRK_HEX_DIGITS_EXPECTED = 0x10202;
95     /**< Hex digits expected as part of a escaped char in a rule. */
96 
97     static final int U_BRK_SEMICOLON_EXPECTED = 0x10203;
98     /**< Missing ';' at the end of a RBBI rule.            */
99 
100     static final int U_BRK_RULE_SYNTAX = 0x10204;
101     /**< Syntax error in RBBI rule.                        */
102 
103     static final int U_BRK_UNCLOSED_SET = 0x10205;
104     /**< UnicodeSet in an RBBI rule is missing a closing ']'.  */
105 
106     static final int U_BRK_ASSIGN_ERROR = 0x10206;
107     /**< Syntax error in RBBI rule assignment statement.   */
108 
109     static final int U_BRK_VARIABLE_REDFINITION = 0x10207;
110     /**< RBBI rule $Variable redefined.                    */
111 
112     static final int U_BRK_MISMATCHED_PAREN = 0x10208;
113     /**< Mis-matched parentheses in an RBBI rule.          */
114 
115     static final int U_BRK_NEW_LINE_IN_QUOTED_STRING = 0x10209;
116     /**< Missing closing quote in an RBBI rule.            */
117 
118     static final int U_BRK_UNDEFINED_VARIABLE = 0x1020a;
119     /**< Use of an undefined $Variable in an RBBI rule.    */
120 
121     static final int U_BRK_INIT_ERROR = 0x1020b;
122     /**< Initialization failure.  Probable missing ICU Data. */
123 
124     static final int U_BRK_RULE_EMPTY_SET = 0x1020c;
125     /**< Rule contains an empty Unicode Set.               */
126 
127     static final int U_BRK_UNRECOGNIZED_OPTION = 0x1020d;
128     /**< !!option in RBBI rules not recognized.            */
129 
130     static final int U_BRK_MALFORMED_RULE_TAG = 0x1020e;
131     /**< The {nnn} tag on a rule is malformed              */
132     static final int U_BRK_MALFORMED_SET = 0x1020f;
133 
134     static final int U_BRK_ERROR_LIMIT = 0x10210;
135     /**< This must always be the last value to indicate the limit for Break Iterator failures */
136 
137 
138     //----------------------------------------------------------------------------------------
139     //
140     //  Constructor.
141     //
142     //----------------------------------------------------------------------------------------
RBBIRuleBuilder(String rules)143     RBBIRuleBuilder(String rules)
144     {
145         fDebugEnv       = ICUDebug.enabled("rbbi") ?
146                             ICUDebug.value("rbbi") : null;
147         fRules          = rules;
148         fStrippedRules  = new StringBuilder(rules);
149         fUSetNodes      = new ArrayList<RBBINode>();
150         fRuleStatusVals = new ArrayList<Integer>();
151         fScanner        = new RBBIRuleScanner(this);
152         fSetBuilder     = new RBBISetBuilder(this);
153     }
154 
155     //----------------------------------------------------------------------------------------
156     //
157     //   flattenData() -  Collect up the compiled RBBI rule data and put it into
158     //                    the format for saving in ICU data files,
159     //
160     //                    See the ICU4C file common/rbbidata.h for a detailed description.
161     //
162     //----------------------------------------------------------------------------------------
align8(int i)163     static final int align8(int i)
164     {
165         return (i + 7) & 0xfffffff8;
166     }
167 
flattenData(OutputStream os)168     void flattenData(OutputStream os) throws IOException {
169         DataOutputStream dos = new DataOutputStream(os);
170         int i;
171 
172         //  Remove whitespace from the rules to make it smaller.
173         //  The rule parser has already removed comments.
174         String strippedRules = RBBIRuleScanner.stripRules(fStrippedRules.toString());
175 
176         // Calculate the size of each section in the data in bytes.
177         //   Sizes here are padded up to a multiple of 8 for better memory alignment.
178         //   Sections sizes actually stored in the header are for the actual data
179         //     without the padding.
180         //
181         int headerSize       = RBBIDataWrapper.DH_SIZE * 4;     // align8(sizeof(RBBIDataHeader));
182         int forwardTableSize = align8(fForwardTable.getTableSize());
183         int reverseTableSize = align8(fForwardTable.getSafeTableSize());
184         int trieSize         = align8(fSetBuilder.getTrieSize());
185         int statusTableSize  = align8(fRuleStatusVals.size() * 4);
186         int rulesSize        = align8((strippedRules.length()) * 2);
187 
188         int totalSize = headerSize
189                 + forwardTableSize
190                 + reverseTableSize
191                 + statusTableSize + trieSize + rulesSize;
192         int outputPos = 0;               // Track stream position, starting from RBBIDataHeader.
193 
194         //
195         // Write out an ICU Data Header
196         //
197         ICUBinary.writeHeader(RBBIDataWrapper.DATA_FORMAT, RBBIDataWrapper.FORMAT_VERSION, 0, dos);
198 
199         //
200         // Write out the RBBIDataHeader
201         //
202         int[] header = new int[RBBIDataWrapper.DH_SIZE];                 // sizeof struct RBBIDataHeader
203         header[RBBIDataWrapper.DH_MAGIC]         = 0xb1a0;
204         header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION;
205         header[RBBIDataWrapper.DH_LENGTH]        = totalSize;            // fLength, the total size of all rule sections.
206         header[RBBIDataWrapper.DH_CATCOUNT]      = fSetBuilder.getNumCharCategories(); // fCatCount.
207 
208         header[RBBIDataWrapper.DH_FTABLE]        = headerSize;           // fFTable
209         header[RBBIDataWrapper.DH_FTABLELEN]     = forwardTableSize;     // fTableLen
210 
211         header[RBBIDataWrapper.DH_RTABLE]        = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable
212         header[RBBIDataWrapper.DH_RTABLELEN]     = reverseTableSize;     // fRTableLen
213 
214         header[RBBIDataWrapper.DH_TRIE]          = header[RBBIDataWrapper.DH_RTABLE]
215                                                      + header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie
216         header[RBBIDataWrapper.DH_TRIELEN]       = fSetBuilder.getTrieSize(); // fTrieLen
217         header[RBBIDataWrapper.DH_STATUSTABLE]   = header[RBBIDataWrapper.DH_TRIE]
218                                                      + header[RBBIDataWrapper.DH_TRIELEN];
219         header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen
220         header[RBBIDataWrapper.DH_RULESOURCE]    = header[RBBIDataWrapper.DH_STATUSTABLE]
221                                                      + statusTableSize;
222         header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRules.length() * 2;
223         for (i = 0; i < header.length; i++) {
224             dos.writeInt(header[i]);
225             outputPos += 4;
226         }
227 
228         // Write out the actual state tables.
229         RBBIDataWrapper.RBBIStateTable table = fForwardTable.exportTable();
230         assert(outputPos == header[RBBIDataWrapper.DH_FTABLE]);
231         outputPos += table.put(dos);
232 
233         table = fForwardTable.exportSafeTable();
234         Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RTABLE]);
235         outputPos += table.put(dos);
236 
237         // write out the Trie table
238         Assert.assrt(outputPos == header[RBBIDataWrapper.DH_TRIE]);
239         fSetBuilder.serializeTrie(os);
240         outputPos += header[RBBIDataWrapper.DH_TRIELEN];
241         while (outputPos % 8 != 0) { // pad to an 8 byte boundary
242             dos.write(0);
243             outputPos += 1;
244         }
245 
246         // Write out the status {tag} table.
247         Assert.assrt(outputPos == header[RBBIDataWrapper.DH_STATUSTABLE]);
248         for (Integer val : fRuleStatusVals) {
249             dos.writeInt(val.intValue());
250             outputPos += 4;
251         }
252 
253         while (outputPos % 8 != 0) { // pad to an 8 byte boundary
254             dos.write(0);
255             outputPos += 1;
256         }
257 
258         // Write out the stripped rules (rules with extra spaces removed
259         //   These go last in the data area, even though they are not last in the header.
260         Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]);
261         dos.writeChars(strippedRules);
262         outputPos += strippedRules.length() * 2;
263         while (outputPos % 8 != 0) { // pad to an 8 byte boundary
264             dos.write(0);
265             outputPos += 1;
266         }
267     }
268 
269     //----------------------------------------------------------------------------------------
270     //
271     //  compileRules          compile source rules, placing the compiled form into a output stream
272     //                        The compiled form is identical to that from ICU4C (Big Endian).
273     //
274     //----------------------------------------------------------------------------------------
compileRules(String rules, OutputStream os)275     static void compileRules(String rules, OutputStream os) throws IOException
276     {
277         //
278         // Read the input rules, generate a parse tree, symbol table,
279         // and list of all Unicode Sets referenced by the rules.
280         //
281         RBBIRuleBuilder builder = new RBBIRuleBuilder(rules);
282         builder.build(os);
283     }
284 
285     /**
286      * Compile rules to the binary form and write that to an output stream.
287      *
288      */
build(OutputStream os)289     void build(OutputStream os) throws IOException {
290         fScanner.parse();
291 
292         //
293         // UnicodeSet processing.
294         //    Munge the Unicode Sets to create a set of character categories.
295         //    Generate the mapping tables (TRIE) from input code points to
296         //    the character categories.
297         //
298         fSetBuilder.buildRanges();
299 
300         //
301         //   Generate the DFA state transition table.
302         //
303         fForwardTable = new RBBITableBuilder(this, fForwardTree);
304         fForwardTable.buildForwardTable();
305         optimizeTables();
306         fForwardTable.buildSafeReverseTable();
307 
308 
309         if (fDebugEnv != null
310                 && fDebugEnv.indexOf("states") >= 0) {
311             fForwardTable.printStates();
312             fForwardTable.printRuleStatusTable();
313             fForwardTable.printReverseTable();
314         }
315 
316         fSetBuilder.buildTrie();
317         //
318         //   Package up the compiled data, writing it to an output stream
319         //      in the serialization format.  This is the same as the ICU4C runtime format.
320         //
321         flattenData(os);
322     }
323 
324     static class IntPair {
325         int first = 0;
326         int second = 0;
IntPair()327         IntPair() {};
IntPair(int f, int s)328         IntPair(int f, int s) {
329             first = f;
330             second = s;
331         }
332     }
333 
optimizeTables()334     void optimizeTables() {
335         boolean didSomething;
336         do {
337             didSomething = false;
338             // Begin looking for duplicates with char class 3.
339             // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
340             // and should not have other categories merged into them.
341             IntPair duplPair = new IntPair(3, 0);
342             while (fForwardTable.findDuplCharClassFrom(duplPair)) {
343                 fSetBuilder.mergeCategories(duplPair);
344                 fForwardTable.removeColumn(duplPair.second);
345                 didSomething = true;
346             }
347             while (fForwardTable.removeDuplicateStates() > 0) {
348                 didSomething = true;
349             };
350         } while (didSomething);
351     }
352 }
353