1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2003-2011, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.text; 11 12 import java.io.IOException; 13 import java.io.OutputStream; 14 import java.util.ArrayList; 15 import java.util.List; 16 17 import ohos.global.icu.impl.Assert; 18 import ohos.global.icu.impl.Trie2Writable; 19 import ohos.global.icu.impl.Trie2_16; 20 import ohos.global.icu.text.RBBIRuleBuilder.IntPair; 21 22 // 23 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules 24 // (part of the rule building process.) 25 // 26 // Starting with the rules parse tree from the scanner, 27 // 28 // - Enumerate the set of UnicodeSets that are referenced 29 // by the RBBI rules. 30 // - compute a set of non-overlapping character ranges 31 // with all characters within a range belonging to the same 32 // set of input uniocde sets. 33 // - Derive a set of non-overlapping UnicodeSet (like things) 34 // that will correspond to columns in the state table for 35 // the RBBI execution engine. All characters within one 36 // of these sets belong to the same set of the original 37 // UnicodeSets from the user's rules. 38 // - construct the trie table that maps input characters 39 // to the index of the matching non-overlapping set of set from 40 // the previous step. 41 // 42 class RBBISetBuilder { 43 static class RangeDescriptor { 44 int fStartChar; // Start of range, unicode 32 bit value. 45 int fEndChar; // End of range, unicode 32 bit value. 46 int fNum; // runtime-mapped input value for this range. 47 List<RBBINode> fIncludesSets; // vector of the the original 48 // Unicode sets that include this range. 49 // (Contains ptrs to uset nodes) 50 RangeDescriptor fNext; // Next RangeDescriptor in the linked list. 51 RangeDescriptor()52 RangeDescriptor() { 53 fIncludesSets = new ArrayList<RBBINode>(); 54 } 55 RangeDescriptor(RangeDescriptor other)56 RangeDescriptor(RangeDescriptor other) { 57 fStartChar = other.fStartChar; 58 fEndChar = other.fEndChar; 59 fNum = other.fNum; 60 fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets); 61 } 62 63 //------------------------------------------------------------------------------------- 64 // 65 // RangeDesriptor::split() 66 // 67 //------------------------------------------------------------------------------------- split(int where)68 void split(int where) { 69 Assert.assrt(where>fStartChar && where<=fEndChar); 70 RangeDescriptor nr = new RangeDescriptor(this); 71 72 // RangeDescriptor copy constructor copies all fields. 73 // Only need to update those that are different after the split. 74 nr.fStartChar = where; 75 this.fEndChar = where-1; 76 nr.fNext = this.fNext; 77 this.fNext = nr; 78 79 // TODO: fIncludesSets is not updated. Check it out. 80 // Probably because they haven't been populated yet, 81 // but still sloppy. 82 } 83 84 85 //------------------------------------------------------------------------------------- 86 // 87 // RangeDescriptor::setDictionaryFlag 88 // 89 // Character Category Numbers that include characters from 90 // the original Unicode Set named "dictionary" have bit 14 91 // set to 1. The RBBI runtime engine uses this to trigger 92 // use of the word dictionary. 93 // 94 // This function looks through the Unicode Sets that it 95 // (the range) includes, and sets the bit in fNum when 96 // "dictionary" is among them. 97 // 98 // TODO: a faster way would be to find the set node for 99 // "dictionary" just once, rather than looking it 100 // up by name every time. 101 // 102 // ------------------------------------------------------------------------------------- setDictionaryFlag()103 void setDictionaryFlag() { 104 int i; 105 106 for (i=0; i<this.fIncludesSets.size(); i++) { 107 RBBINode usetNode = fIncludesSets.get(i); 108 String setName = ""; 109 RBBINode setRef = usetNode.fParent; 110 if (setRef != null) { 111 RBBINode varRef = setRef.fParent; 112 if (varRef != null && varRef.fType == RBBINode.varRef) { 113 setName = varRef.fText; 114 } 115 } 116 if (setName.equals("dictionary")) { 117 this.fNum |= DICT_BIT; 118 break; 119 } 120 } 121 122 } 123 } 124 125 126 RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us. 127 RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors 128 129 Trie2Writable fTrie; // The mapping TRIE that is the end result of processing 130 // the Unicode Sets. 131 Trie2_16 fFrozenTrie; 132 133 // Groups correspond to character categories - 134 // groups of ranges that are in the same original UnicodeSets. 135 // fGroupCount is the index of the last used group. 136 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. 137 // State table column 0 is not used. Column 1 is for end-of-input. 138 // column 2 is for group 0. Funny counting. 139 int fGroupCount; 140 141 boolean fSawBOF; 142 143 static final int DICT_BIT = 0x4000; 144 145 146 //------------------------------------------------------------------------ 147 // 148 // RBBISetBuilder Constructor 149 // 150 //------------------------------------------------------------------------ RBBISetBuilder(RBBIRuleBuilder rb)151 RBBISetBuilder(RBBIRuleBuilder rb) 152 { 153 fRB = rb; 154 } 155 156 157 //------------------------------------------------------------------------ 158 // 159 // build Build the list of non-overlapping character ranges 160 // from the Unicode Sets. 161 // 162 //------------------------------------------------------------------------ buildRanges()163 void buildRanges() { 164 RangeDescriptor rlRange; 165 166 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();} 167 168 // Initialize the process by creating a single range encompassing all characters 169 // that is in no sets. 170 // 171 fRangeList = new RangeDescriptor(); 172 fRangeList.fStartChar = 0; 173 fRangeList.fEndChar = 0x10ffff; 174 175 // 176 // Find the set of non-overlapping ranges of characters 177 // 178 for (RBBINode usetNode : fRB.fUSetNodes) { 179 UnicodeSet inputSet = usetNode.fInputSet; 180 int inputSetRangeCount = inputSet.getRangeCount(); 181 int inputSetRangeIndex = 0; 182 rlRange = fRangeList; 183 184 for (;;) { 185 if (inputSetRangeIndex >= inputSetRangeCount) { 186 break; 187 } 188 int inputSetRangeBegin = inputSet.getRangeStart(inputSetRangeIndex); 189 int inputSetRangeEnd = inputSet.getRangeEnd(inputSetRangeIndex); 190 191 // skip over ranges from the range list that are completely 192 // below the current range from the input unicode set. 193 while (rlRange.fEndChar < inputSetRangeBegin) { 194 rlRange = rlRange.fNext; 195 } 196 197 // If the start of the range from the range list is before with 198 // the start of the range from the unicode set, split the range list range 199 // in two, with one part being before (wholly outside of) the unicode set 200 // and the other containing the rest. 201 // Then continue the loop; the post-split current range will then be skipped 202 // over 203 if (rlRange.fStartChar < inputSetRangeBegin) { 204 rlRange.split(inputSetRangeBegin); 205 continue; 206 } 207 208 // Same thing at the end of the ranges... 209 // If the end of the range from the range list doesn't coincide with 210 // the end of the range from the unicode set, split the range list 211 // range in two. The first part of the split range will be 212 // wholly inside the Unicode set. 213 if (rlRange.fEndChar > inputSetRangeEnd) { 214 rlRange.split(inputSetRangeEnd+1); 215 } 216 217 // The current rlRange is now entirely within the UnicodeSet range. 218 // Add this unicode set to the list of sets for this rlRange 219 if (rlRange.fIncludesSets.indexOf(usetNode) == -1) { 220 rlRange.fIncludesSets.add(usetNode); 221 } 222 223 // Advance over ranges that we are finished with. 224 if (inputSetRangeEnd == rlRange.fEndChar) { 225 inputSetRangeIndex++; 226 } 227 rlRange = rlRange.fNext; 228 } 229 } 230 231 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("range")>=0) { printRanges();} 232 233 // 234 // Group the above ranges, with each group consisting of one or more 235 // ranges that are in exactly the same set of original UnicodeSets. 236 // The groups are numbered, and these group numbers are the set of 237 // input symbols recognized by the run-time state machine. 238 // 239 // Numbering: # 0 (state table column 0) is unused. 240 // # 1 is reserved - table column 1 is for end-of-input 241 // # 2 is reserved - table column 2 is for beginning-in-input 242 // # 3 is the first range list. 243 // 244 RangeDescriptor rlSearchRange; 245 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 246 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) { 247 if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) { 248 rlRange.fNum = rlSearchRange.fNum; 249 break; 250 } 251 } 252 if (rlRange.fNum == 0) { 253 fGroupCount ++; 254 rlRange.fNum = fGroupCount+2; 255 rlRange.setDictionaryFlag(); 256 addValToSets(rlRange.fIncludesSets, fGroupCount+2); 257 } 258 } 259 260 // Handle input sets that contain the special string {eof}. 261 // Column 1 of the state table is reserved for EOF on input. 262 // Column 2 is reserved for before-the-start-input. 263 // (This column can be optimized away later if there are no rule 264 // references to {bof}.) 265 // Add this column value (1 or 2) to the equivalent expression 266 // subtree for each UnicodeSet that contains the string {eof} 267 // Because {bof} and {eof} are not a characters in the normal sense, 268 // they doesn't affect the computation of ranges or TRIE. 269 270 String eofString = "eof"; 271 String bofString = "bof"; 272 273 for (RBBINode usetNode : fRB.fUSetNodes) { 274 UnicodeSet inputSet = usetNode.fInputSet; 275 if (inputSet.contains(eofString)) { 276 addValToSet(usetNode, 1); 277 } 278 if (inputSet.contains(bofString)) { 279 addValToSet(usetNode, 2); 280 fSawBOF = true; 281 } 282 } 283 284 285 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();} 286 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();} 287 } 288 289 290 /** 291 * Build the Trie table for mapping UChar32 values to the corresponding 292 * range group number. 293 */ buildTrie()294 void buildTrie() { 295 RangeDescriptor rlRange; 296 297 fTrie = new Trie2Writable(0, // Initial value for all code points. 298 0); // Error value for out-of-range input. 299 300 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 301 fTrie.setRange( 302 rlRange.fStartChar, // Range start 303 rlRange.fEndChar, // Range end (inclusive) 304 rlRange.fNum, // value for range 305 true // Overwrite previously written values 306 ); 307 } 308 } 309 310 /** 311 * Merge two character categories that have been identified as having equivalent behavior. 312 * The ranges belonging to the second category (table column) will be added to the first. 313 * @param categories the pair of categories to be merged. 314 */ mergeCategories(IntPair categories)315 void mergeCategories(IntPair categories) { 316 assert(categories.first >= 1); 317 assert(categories.second > categories.first); 318 for (RangeDescriptor rd = fRangeList; rd != null; rd = rd.fNext) { 319 int rangeNum = rd.fNum & ~DICT_BIT; 320 int rangeDict = rd.fNum & DICT_BIT; 321 if (rangeNum == categories.second) { 322 rd.fNum = categories.first | rangeDict; 323 } else if (rangeNum > categories.second) { 324 rd.fNum--; 325 } 326 } 327 --fGroupCount; 328 } 329 330 //----------------------------------------------------------------------------------- 331 // 332 // getTrieSize() Return the size that will be required to serialize the Trie. 333 // 334 //----------------------------------------------------------------------------------- getTrieSize()335 int getTrieSize() { 336 if (fFrozenTrie == null) { 337 fFrozenTrie = fTrie.toTrie2_16(); 338 fTrie = null; 339 } 340 return fFrozenTrie.getSerializedLength(); 341 } 342 343 344 //----------------------------------------------------------------------------------- 345 // 346 // serializeTrie() Write the serialized trie to an output stream 347 // 348 //----------------------------------------------------------------------------------- serializeTrie(OutputStream os)349 void serializeTrie(OutputStream os) throws IOException { 350 if (fFrozenTrie == null) { 351 fFrozenTrie = fTrie.toTrie2_16(); 352 fTrie = null; 353 } 354 fFrozenTrie.serialize(os); 355 } 356 357 //------------------------------------------------------------------------ 358 // 359 // addValToSets Add a runtime-mapped input value to each uset from a 360 // list of uset nodes. (val corresponds to a state table column.) 361 // For each of the original Unicode sets - which correspond 362 // directly to uset nodes - a logically equivalent expression 363 // is constructed in terms of the remapped runtime input 364 // symbol set. This function adds one runtime input symbol to 365 // a list of sets. 366 // 367 // The "logically equivalent expression" is the tree for an 368 // or-ing together of all of the symbols that go into the set. 369 // 370 //------------------------------------------------------------------------ addValToSets(List<RBBINode> sets, int val)371 void addValToSets(List<RBBINode> sets, int val) { 372 for (RBBINode usetNode : sets) { 373 addValToSet(usetNode, val); 374 } 375 } 376 addValToSet(RBBINode usetNode, int val)377 void addValToSet(RBBINode usetNode, int val) { 378 RBBINode leafNode = new RBBINode(RBBINode.leafChar); 379 leafNode.fVal = val; 380 if (usetNode.fLeftChild == null) { 381 usetNode.fLeftChild = leafNode; 382 leafNode.fParent = usetNode; 383 } else { 384 // There are already input symbols present for this set. 385 // Set up an OR node, with the previous stuff as the left child 386 // and the new value as the right child. 387 RBBINode orNode = new RBBINode(RBBINode.opOr); 388 orNode.fLeftChild = usetNode.fLeftChild; 389 orNode.fRightChild = leafNode; 390 orNode.fLeftChild.fParent = orNode; 391 orNode.fRightChild.fParent = orNode; 392 usetNode.fLeftChild = orNode; 393 orNode.fParent = usetNode; 394 } 395 } 396 397 398 //------------------------------------------------------------------------ 399 // 400 // getNumCharCategories 401 // 402 //------------------------------------------------------------------------ getNumCharCategories()403 int getNumCharCategories() { 404 return fGroupCount + 3; 405 } 406 407 408 //------------------------------------------------------------------------ 409 // 410 // sawBOF 411 // 412 //------------------------------------------------------------------------ sawBOF()413 boolean sawBOF() { 414 return fSawBOF; 415 } 416 417 418 //------------------------------------------------------------------------ 419 // 420 // getFirstChar Given a runtime RBBI character category, find 421 // the first UChar32 that is in the set of chars 422 // in the category. 423 //------------------------------------------------------------------------ getFirstChar(int category)424 int getFirstChar(int category) { 425 RangeDescriptor rlRange; 426 int retVal = -1; 427 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 428 if (rlRange.fNum == category) { 429 retVal = rlRange.fStartChar; 430 break; 431 } 432 } 433 return retVal; 434 } 435 436 437 438 //------------------------------------------------------------------------ 439 // 440 // printRanges A debugging function. 441 // dump out all of the range definitions. 442 // 443 //------------------------------------------------------------------------ 444 ///CLOVER:OFF printRanges()445 void printRanges() { 446 RangeDescriptor rlRange; 447 int i; 448 449 System.out.print("\n\n Nonoverlapping Ranges ...\n"); 450 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 451 System.out.print(" " + rlRange.fNum + " " + rlRange.fStartChar + "-" + rlRange.fEndChar); 452 453 for (i=0; i<rlRange.fIncludesSets.size(); i++) { 454 RBBINode usetNode = rlRange.fIncludesSets.get(i); 455 String setName = "anon"; 456 RBBINode setRef = usetNode.fParent; 457 if (setRef != null) { 458 RBBINode varRef = setRef.fParent; 459 if (varRef != null && varRef.fType == RBBINode.varRef) { 460 setName = varRef.fText; 461 } 462 } 463 System.out.print(setName); System.out.print(" "); 464 } 465 System.out.println(""); 466 } 467 } 468 ///CLOVER:ON 469 470 471 //------------------------------------------------------------------------ 472 // 473 // printRangeGroups A debugging function. 474 // dump out all of the range groups. 475 // 476 //------------------------------------------------------------------------ 477 ///CLOVER:OFF printRangeGroups()478 void printRangeGroups() { 479 RangeDescriptor rlRange; 480 RangeDescriptor tRange; 481 int i; 482 int lastPrintedGroupNum = 0; 483 484 System.out.print("\nRanges grouped by Unicode Set Membership...\n"); 485 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 486 int groupNum = rlRange.fNum & 0xbfff; 487 if (groupNum > lastPrintedGroupNum) { 488 lastPrintedGroupNum = groupNum; 489 if (groupNum<10) {System.out.print(" ");} 490 System.out.print(groupNum + " "); 491 492 if ((rlRange.fNum & DICT_BIT) != 0) { System.out.print(" <DICT> ");} 493 494 for (i=0; i<rlRange.fIncludesSets.size(); i++) { 495 RBBINode usetNode = rlRange.fIncludesSets.get(i); 496 String setName = "anon"; 497 RBBINode setRef = usetNode.fParent; 498 if (setRef != null) { 499 RBBINode varRef = setRef.fParent; 500 if (varRef != null && varRef.fType == RBBINode.varRef) { 501 setName = varRef.fText; 502 } 503 } 504 System.out.print(setName); System.out.print(" "); 505 } 506 507 i = 0; 508 for (tRange = rlRange; tRange != null; tRange = tRange.fNext) { 509 if (tRange.fNum == rlRange.fNum) { 510 if (i++ % 5 == 0) { 511 System.out.print("\n "); 512 } 513 RBBINode.printHex(tRange.fStartChar, -1); 514 System.out.print("-"); 515 RBBINode.printHex(tRange.fEndChar, 0); 516 } 517 } 518 System.out.print("\n"); 519 } 520 } 521 System.out.print("\n"); 522 } 523 ///CLOVER:ON 524 525 526 //------------------------------------------------------------------------ 527 // 528 // printSets A debugging function. 529 // dump out all of the set definitions. 530 // 531 //------------------------------------------------------------------------ 532 ///CLOVER:OFF printSets()533 void printSets() { 534 int i; 535 System.out.print("\n\nUnicode Sets List\n------------------\n"); 536 for (i=0; i<fRB.fUSetNodes.size(); i++) { 537 RBBINode usetNode; 538 RBBINode setRef; 539 RBBINode varRef; 540 String setName; 541 542 usetNode = fRB.fUSetNodes.get(i); 543 544 //System.out.print(" " + i + " "); 545 RBBINode.printInt(2, i); 546 setName = "anonymous"; 547 setRef = usetNode.fParent; 548 if (setRef != null) { 549 varRef = setRef.fParent; 550 if (varRef != null && varRef.fType == RBBINode.varRef) { 551 setName = varRef.fText; 552 } 553 } 554 System.out.print(" " + setName); 555 System.out.print(" "); 556 System.out.print(usetNode.fText); 557 System.out.print("\n"); 558 if (usetNode.fLeftChild != null) { 559 usetNode.fLeftChild.printTree(true); 560 } 561 } 562 System.out.print("\n"); 563 } 564 ///CLOVER:ON 565 } 566