1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 // 5 // Copyright (C) 2002-2014, International Business Machines Corporation and others. 6 // All Rights Reserved. 7 // 8 // 9 10 package ohos.global.icu.text; 11 12 import java.io.DataOutputStream; 13 import java.io.IOException; 14 import java.io.OutputStream; 15 import java.util.ArrayList; 16 import java.util.HashMap; 17 import java.util.List; 18 import java.util.Map; 19 import java.util.Set; 20 21 import ohos.global.icu.impl.Assert; 22 import ohos.global.icu.impl.ICUBinary; 23 import ohos.global.icu.impl.ICUDebug; 24 import ohos.global.icu.impl.RBBIDataWrapper; 25 26 class RBBIRuleBuilder { 27 // This is the main class for building (compiling) break rules into the tables 28 // required by the runtime RBBI engine. 29 // 30 31 String fDebugEnv; // controls debug trace output 32 String fRules; // The rule string that we are compiling 33 StringBuilder fStrippedRules; // The rule string, with comments stripped. 34 RBBIRuleScanner fScanner; // The scanner. 35 36 37 // 38 // There are four separate parse trees generated, one for each of the 39 // forward rules, reverse rules, safe forward rules and safe reverse rules. 40 // This array references the root of each of the trees. 41 // Only fForwardTree data is actually used to generate a state table. 42 // The other three are retained for back compatibility with old rule files, 43 // which may have safe and reverse rules. These are still parsed. 44 // 45 RBBINode[] fTreeRoots = new RBBINode[4]; 46 static final int fForwardTree = 0; // Indexes into the above fTreeRoots array 47 static final int fReverseTree = 1; // for each of the trees. 48 static final int fSafeFwdTree = 2; // (in C, these are pointer variables and 49 static final int fSafeRevTree = 3; // there is no array.) 50 int fDefaultTree = fForwardTree; // For rules not qualified with a ! 51 // the tree to which they belong to. 52 53 boolean fChainRules; // True for chained Unicode TR style rules. 54 // False for traditional regexp rules. 55 56 boolean fLBCMNoChain; // True: suppress chaining of rules on 57 // chars with LineBreak property == CM. 58 59 boolean fLookAheadHardBreak; // True: Look ahead matches cause an 60 // immediate break, no continuing for the 61 // longest match. 62 63 RBBISetBuilder fSetBuilder; // Set and Character Category builder. 64 List<RBBINode> fUSetNodes; // Vector of all used nodes. 65 RBBITableBuilder fForwardTable; // State transition tables 66 67 // 68 // Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.). 69 // 70 Map<Set<Integer>, Integer> fStatusSets = new HashMap<Set<Integer>, Integer>(); // Status value sets encountered so far. 71 // Map Key is the set of values. 72 // Map Value is the runtime array index. 73 74 List<Integer> fRuleStatusVals; // List of Integer objects. Has same layout as the 75 // runtime array of status (tag) values - 76 // number of values in group 1 77 // first status value in group 1 78 // 2nd status value in group 1 79 // ... 80 // number of values in group 2 81 // first status value in group 2 82 // etc. 83 // 84 // Error codes from ICU4C. 85 // using these simplified the porting, and consolidated the 86 // creation of Java exceptions 87 // 88 static final int U_BRK_ERROR_START = 0x10200; 89 /**< Start of codes indicating Break Iterator failures */ 90 91 static final int U_BRK_INTERNAL_ERROR = 0x10201; 92 /**< An internal error (bug) was detected. */ 93 94 static final int U_BRK_HEX_DIGITS_EXPECTED = 0x10202; 95 /**< Hex digits expected as part of a escaped char in a rule. */ 96 97 static final int U_BRK_SEMICOLON_EXPECTED = 0x10203; 98 /**< Missing ';' at the end of a RBBI rule. */ 99 100 static final int U_BRK_RULE_SYNTAX = 0x10204; 101 /**< Syntax error in RBBI rule. */ 102 103 static final int U_BRK_UNCLOSED_SET = 0x10205; 104 /**< UnicodeSet writing an RBBI rule missing a closing ']'. */ 105 106 static final int U_BRK_ASSIGN_ERROR = 0x10206; 107 /**< Syntax error in RBBI rule assignment statement. */ 108 109 static final int U_BRK_VARIABLE_REDFINITION = 0x10207; 110 /**< RBBI rule $Variable redefined. */ 111 112 static final int U_BRK_MISMATCHED_PAREN = 0x10208; 113 /**< Mis-matched parentheses in an RBBI rule. */ 114 115 static final int U_BRK_NEW_LINE_IN_QUOTED_STRING = 0x10209; 116 /**< Missing closing quote in an RBBI rule. */ 117 118 static final int U_BRK_UNDEFINED_VARIABLE = 0x1020a; 119 /**< Use of an undefined $Variable in an RBBI rule. */ 120 121 static final int U_BRK_INIT_ERROR = 0x1020b; 122 /**< Initialization failure. Probable missing ICU Data. */ 123 124 static final int U_BRK_RULE_EMPTY_SET = 0x1020c; 125 /**< Rule contains an empty Unicode Set. */ 126 127 static final int U_BRK_UNRECOGNIZED_OPTION = 0x1020d; 128 /**< !!option in RBBI rules not recognized. */ 129 130 static final int U_BRK_MALFORMED_RULE_TAG = 0x1020e; 131 /**< The {nnn} tag on a rule is mal formed */ 132 static final int U_BRK_MALFORMED_SET = 0x1020f; 133 134 static final int U_BRK_ERROR_LIMIT = 0x10210; 135 /**< This must always be the last value to indicate the limit for Break Iterator failures */ 136 137 138 //---------------------------------------------------------------------------------------- 139 // 140 // Constructor. 141 // 142 //---------------------------------------------------------------------------------------- RBBIRuleBuilder(String rules)143 RBBIRuleBuilder(String rules) 144 { 145 fDebugEnv = ICUDebug.enabled("rbbi") ? 146 ICUDebug.value("rbbi") : null; 147 fRules = rules; 148 fStrippedRules = new StringBuilder(rules); 149 fUSetNodes = new ArrayList<RBBINode>(); 150 fRuleStatusVals = new ArrayList<Integer>(); 151 fScanner = new RBBIRuleScanner(this); 152 fSetBuilder = new RBBISetBuilder(this); 153 } 154 155 //---------------------------------------------------------------------------------------- 156 // 157 // flattenData() - Collect up the compiled RBBI rule data and put it into 158 // the format for saving in ICU data files, 159 // 160 // See the ICU4C file common/rbidata.h for a detailed description. 161 // 162 //---------------------------------------------------------------------------------------- align8(int i)163 static final int align8(int i) 164 { 165 return (i + 7) & 0xfffffff8; 166 } 167 flattenData(OutputStream os)168 void flattenData(OutputStream os) throws IOException { 169 DataOutputStream dos = new DataOutputStream(os); 170 int i; 171 172 // Remove whitespace from the rules to make it smaller. 173 // The rule parser has already removed comments. 174 String strippedRules = RBBIRuleScanner.stripRules(fStrippedRules.toString()); 175 176 // Calculate the size of each section in the data in bytes. 177 // Sizes here are padded up to a multiple of 8 for better memory alignment. 178 // Sections sizes actually stored in the header are for the actual data 179 // without the padding. 180 // 181 int headerSize = RBBIDataWrapper.DH_SIZE * 4; // align8(sizeof(RBBIDataHeader)); 182 int forwardTableSize = align8(fForwardTable.getTableSize()); 183 int reverseTableSize = align8(fForwardTable.getSafeTableSize()); 184 int trieSize = align8(fSetBuilder.getTrieSize()); 185 int statusTableSize = align8(fRuleStatusVals.size() * 4); 186 int rulesSize = align8((strippedRules.length()) * 2); 187 188 int totalSize = headerSize 189 + forwardTableSize 190 + reverseTableSize 191 + statusTableSize + trieSize + rulesSize; 192 int outputPos = 0; // Track stream position, starting from RBBIDataHeader. 193 194 // 195 // Write out an ICU Data Header 196 // 197 ICUBinary.writeHeader(RBBIDataWrapper.DATA_FORMAT, RBBIDataWrapper.FORMAT_VERSION, 0, dos); 198 199 // 200 // Write out the RBBIDataHeader 201 // 202 int[] header = new int[RBBIDataWrapper.DH_SIZE]; // sizeof struct RBBIDataHeader 203 header[RBBIDataWrapper.DH_MAGIC] = 0xb1a0; 204 header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION; 205 header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections. 206 header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount. 207 208 header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable 209 header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen 210 211 header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable 212 header[RBBIDataWrapper.DH_RTABLELEN] = reverseTableSize; // fRTableLen 213 214 header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_RTABLE] 215 + header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie 216 header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen 217 header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE] 218 + header[RBBIDataWrapper.DH_TRIELEN]; 219 header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen 220 header[RBBIDataWrapper.DH_RULESOURCE] = header[RBBIDataWrapper.DH_STATUSTABLE] 221 + statusTableSize; 222 header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRules.length() * 2; 223 for (i = 0; i < header.length; i++) { 224 dos.writeInt(header[i]); 225 outputPos += 4; 226 } 227 228 // Write out the actual state tables. 229 RBBIDataWrapper.RBBIStateTable table = fForwardTable.exportTable(); 230 assert(outputPos == header[RBBIDataWrapper.DH_FTABLE]); 231 outputPos += table.put(dos); 232 233 table = fForwardTable.exportSafeTable(); 234 Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RTABLE]); 235 outputPos += table.put(dos); 236 237 // write out the Trie table 238 Assert.assrt(outputPos == header[RBBIDataWrapper.DH_TRIE]); 239 fSetBuilder.serializeTrie(os); 240 outputPos += header[RBBIDataWrapper.DH_TRIELEN]; 241 while (outputPos % 8 != 0) { // pad to an 8 byte boundary 242 dos.write(0); 243 outputPos += 1; 244 } 245 246 // Write out the status {tag} table. 247 Assert.assrt(outputPos == header[RBBIDataWrapper.DH_STATUSTABLE]); 248 for (Integer val : fRuleStatusVals) { 249 dos.writeInt(val.intValue()); 250 outputPos += 4; 251 } 252 253 while (outputPos % 8 != 0) { // pad to an 8 byte boundary 254 dos.write(0); 255 outputPos += 1; 256 } 257 258 // Write out the stripped rules (rules with extra spaces removed 259 // These go last in the data area, even though they are not last in the header. 260 Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]); 261 dos.writeChars(strippedRules); 262 outputPos += strippedRules.length() * 2; 263 while (outputPos % 8 != 0) { // pad to an 8 byte boundary 264 dos.write(0); 265 outputPos += 1; 266 } 267 } 268 269 //---------------------------------------------------------------------------------------- 270 // 271 // compileRules compile source rules, placing the compiled form into a output stream 272 // The compiled form is identical to that from ICU4C (Big Endian). 273 // 274 //---------------------------------------------------------------------------------------- compileRules(String rules, OutputStream os)275 static void compileRules(String rules, OutputStream os) throws IOException 276 { 277 // 278 // Read the input rules, generate a parse tree, symbol table, 279 // and list of all Unicode Sets referenced by the rules. 280 // 281 RBBIRuleBuilder builder = new RBBIRuleBuilder(rules); 282 builder.build(os); 283 } 284 285 /** 286 * Compile rules to the binary form, write that to an ouput stream. 287 * 288 */ build(OutputStream os)289 void build(OutputStream os) throws IOException { 290 fScanner.parse(); 291 292 // 293 // UnicodeSet processing. 294 // Munge the Unicode Sets to create a set of character categories. 295 // Generate the mapping tables (TRIE) from input code points to 296 // the character categories. 297 // 298 fSetBuilder.buildRanges(); 299 300 // 301 // Generate the DFA state transition table. 302 // 303 fForwardTable = new RBBITableBuilder(this, fForwardTree); 304 fForwardTable.buildForwardTable(); 305 optimizeTables(); 306 fForwardTable.buildSafeReverseTable(); 307 308 309 if (fDebugEnv != null 310 && fDebugEnv.indexOf("states") >= 0) { 311 fForwardTable.printStates(); 312 fForwardTable.printRuleStatusTable(); 313 fForwardTable.printReverseTable(); 314 } 315 316 fSetBuilder.buildTrie(); 317 // 318 // Package up the compiled data, writing it to an output stream 319 // in the serialization format. This is the same as the ICU4C runtime format. 320 // 321 flattenData(os); 322 } 323 324 static class IntPair { 325 int first = 0; 326 int second = 0; IntPair()327 IntPair() {}; IntPair(int f, int s)328 IntPair(int f, int s) { 329 first = f; 330 second = s; 331 } 332 } 333 optimizeTables()334 void optimizeTables() { 335 boolean didSomething; 336 do { 337 didSomething = false; 338 // Begin looking for duplicates with char class 3. 339 // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, 340 // and should not have other categories merged into them. 341 IntPair duplPair = new IntPair(3, 0); 342 while (fForwardTable.findDuplCharClassFrom(duplPair)) { 343 fSetBuilder.mergeCategories(duplPair); 344 fForwardTable.removeColumn(duplPair.second); 345 didSomething = true; 346 } 347 while (fForwardTable.removeDuplicateStates() > 0) { 348 didSomething = true; 349 }; 350 } while (didSomething); 351 } 352 } 353