1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2003-2011, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.io.ByteArrayOutputStream; 12 import java.io.IOException; 13 import java.io.OutputStream; 14 import java.util.ArrayList; 15 import java.util.List; 16 17 import com.ibm.icu.impl.Assert; 18 import com.ibm.icu.text.RBBIRuleBuilder.IntPair; 19 import com.ibm.icu.util.CodePointTrie; 20 import com.ibm.icu.util.MutableCodePointTrie; 21 22 // 23 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules 24 // (part of the rule building process.) 25 // 26 // Starting with the rules parse tree from the scanner, 27 // 28 // - Enumerate the set of UnicodeSets that are referenced 29 // by the RBBI rules. 30 // - compute a set of non-overlapping character ranges 31 // with all characters within a range belonging to the same 32 // set of input unicode sets. 33 // - Derive a set of non-overlapping UnicodeSet (like things) 34 // that will correspond to columns in the state table for 35 // the RBBI execution engine. All characters within one 36 // of these sets belong to the same set of the original 37 // UnicodeSets from the user's rules. 38 // - construct the trie table that maps input characters 39 // to the index of the matching non-overlapping set of set from 40 // the previous step. 41 // 42 class RBBISetBuilder { 43 static class RangeDescriptor { 44 int fStartChar = 0; // Start of range, unicode 32 bit value. 45 int fEndChar = 0; // End of range, unicode 32 bit value. 46 int fNum = 0; // runtime-mapped input value for this range. 47 boolean fIncludesDict = false; // True if the range includes $dictionary. 48 boolean fFirstInGroup = false; // True if first range in a group with the same fNum. 49 List<RBBINode> fIncludesSets; // vector of the the original 50 // Unicode sets that include this range. 51 // (Contains ptrs to uset nodes) 52 RangeDescriptor fNext; // Next RangeDescriptor in the linked list. 53 RangeDescriptor()54 RangeDescriptor() { 55 fIncludesSets = new ArrayList<>(); 56 } 57 RangeDescriptor(RangeDescriptor other)58 RangeDescriptor(RangeDescriptor other) { 59 fStartChar = other.fStartChar; 60 fEndChar = other.fEndChar; 61 fNum = other.fNum; 62 fIncludesDict = other.fIncludesDict; 63 fFirstInGroup = other.fFirstInGroup; 64 fIncludesSets = new ArrayList<>(other.fIncludesSets); 65 } 66 67 //------------------------------------------------------------------------------------- 68 // 69 // RangeDesriptor::split() 70 // 71 //------------------------------------------------------------------------------------- split(int where)72 void split(int where) { 73 Assert.assrt(where>fStartChar && where<=fEndChar); 74 RangeDescriptor nr = new RangeDescriptor(this); 75 76 // RangeDescriptor copy constructor copies all fields. 77 // Only need to update those that are different after the split. 78 nr.fStartChar = where; 79 this.fEndChar = where-1; 80 nr.fNext = this.fNext; 81 this.fNext = nr; 82 83 // TODO: fIncludesSets is not updated. Check it out. 84 // Probably because they haven't been populated yet, 85 // but still sloppy. 86 } 87 88 89 /** 90 * Test whether this range includes characters from the original Unicode Set named "dictionary". 91 * 92 * This function looks through the Unicode Sets that 93 * the range includes, checking for one named "dictionary" 94 */ 95 // TODO: a faster way would be to find the set node for 96 // "dictionary" just once, rather than looking it 97 // up by name every time. 98 // isDictionaryRange()99 boolean isDictionaryRange() { 100 for (int i=0; i<this.fIncludesSets.size(); i++) { 101 RBBINode usetNode = fIncludesSets.get(i); 102 String setName = ""; 103 RBBINode setRef = usetNode.fParent; 104 if (setRef != null) { 105 RBBINode varRef = setRef.fParent; 106 if (varRef != null && varRef.fType == RBBINode.varRef) { 107 setName = varRef.fText; 108 } 109 } 110 if (setName.equals("dictionary")) { 111 return true; 112 } 113 } 114 return false; 115 } 116 } 117 118 119 RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us. 120 RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors 121 122 MutableCodePointTrie fTrie; // The mapping TRIE that is the end result of processing 123 // the Unicode Sets. 124 CodePointTrie fFrozenTrie; 125 126 /** 127 * Number of range groups, which are groups of ranges that are in the same original UnicodeSets. 128 */ 129 int fGroupCount; 130 /** 131 * The number of the first dictionary char category. 132 * If there are no Dictionary categories, set to the last category + 1. 133 */ 134 int fDictCategoriesStart; 135 136 boolean fSawBOF; 137 138 139 //------------------------------------------------------------------------ 140 // 141 // RBBISetBuilder Constructor 142 // 143 //------------------------------------------------------------------------ RBBISetBuilder(RBBIRuleBuilder rb)144 RBBISetBuilder(RBBIRuleBuilder rb) 145 { 146 fRB = rb; 147 } 148 149 150 //------------------------------------------------------------------------ 151 // 152 // build Build the list of non-overlapping character ranges 153 // from the Unicode Sets. 154 // 155 //------------------------------------------------------------------------ buildRanges()156 void buildRanges() { 157 RangeDescriptor rlRange; 158 159 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();} 160 161 // Initialize the process by creating a single range encompassing all characters 162 // that is in no sets. 163 // 164 fRangeList = new RangeDescriptor(); 165 fRangeList.fStartChar = 0; 166 fRangeList.fEndChar = 0x10ffff; 167 168 // 169 // Find the set of non-overlapping ranges of characters 170 // 171 for (RBBINode usetNode : fRB.fUSetNodes) { 172 UnicodeSet inputSet = usetNode.fInputSet; 173 int inputSetRangeCount = inputSet.getRangeCount(); 174 int inputSetRangeIndex = 0; 175 rlRange = fRangeList; 176 177 for (;;) { 178 if (inputSetRangeIndex >= inputSetRangeCount) { 179 break; 180 } 181 int inputSetRangeBegin = inputSet.getRangeStart(inputSetRangeIndex); 182 int inputSetRangeEnd = inputSet.getRangeEnd(inputSetRangeIndex); 183 184 // skip over ranges from the range list that are completely 185 // below the current range from the input unicode set. 186 while (rlRange.fEndChar < inputSetRangeBegin) { 187 rlRange = rlRange.fNext; 188 } 189 190 // If the start of the range from the range list is before with 191 // the start of the range from the unicode set, split the range list range 192 // in two, with one part being before (wholly outside of) the unicode set 193 // and the other containing the rest. 194 // Then continue the loop; the post-split current range will then be skipped 195 // over 196 if (rlRange.fStartChar < inputSetRangeBegin) { 197 rlRange.split(inputSetRangeBegin); 198 continue; 199 } 200 201 // Same thing at the end of the ranges... 202 // If the end of the range from the range list doesn't coincide with 203 // the end of the range from the unicode set, split the range list 204 // range in two. The first part of the split range will be 205 // wholly inside the Unicode set. 206 if (rlRange.fEndChar > inputSetRangeEnd) { 207 rlRange.split(inputSetRangeEnd+1); 208 } 209 210 // The current rlRange is now entirely within the UnicodeSet range. 211 // Add this unicode set to the list of sets for this rlRange 212 if (rlRange.fIncludesSets.indexOf(usetNode) == -1) { 213 rlRange.fIncludesSets.add(usetNode); 214 } 215 216 // Advance over ranges that we are finished with. 217 if (inputSetRangeEnd == rlRange.fEndChar) { 218 inputSetRangeIndex++; 219 } 220 rlRange = rlRange.fNext; 221 } 222 } 223 224 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("range")>=0) { printRanges();} 225 226 // 227 // Group the above ranges, with each group consisting of one or more 228 // ranges that are in exactly the same set of original UnicodeSets. 229 // The groups are numbered, and these group numbers are the set of 230 // input symbols recognized by the run-time state machine. 231 // 232 // Numbering: # 0 (state table column 0) is unused. 233 // # 1 is reserved - table column 1 is for end-of-input 234 // # 2 is reserved - table column 2 is for beginning-of-input 235 // # 3 is the first range list. 236 // 237 RangeDescriptor rlSearchRange; 238 int dictGroupCount = 0; 239 240 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 241 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) { 242 if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) { 243 rlRange.fNum = rlSearchRange.fNum; 244 rlRange.fIncludesDict = rlSearchRange.fIncludesDict; 245 break; 246 } 247 } 248 if (rlRange.fNum == 0) { 249 rlRange.fFirstInGroup = true; 250 if (rlRange.isDictionaryRange()) { 251 rlRange.fNum = ++dictGroupCount; 252 rlRange.fIncludesDict = true; 253 } else { 254 fGroupCount++; 255 rlRange.fNum = fGroupCount + 2; 256 addValToSets(rlRange.fIncludesSets, fGroupCount + 2); 257 } 258 } 259 } 260 261 // Move the character category numbers for any dictionary ranges up, so that they 262 // immediately follow the non-dictionary ranges. 263 264 fDictCategoriesStart = fGroupCount + 3; 265 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 266 if (rlRange.fIncludesDict) { 267 rlRange.fNum += fDictCategoriesStart - 1; 268 if (rlRange.fFirstInGroup) { 269 addValToSets(rlRange.fIncludesSets, rlRange.fNum); 270 } 271 } 272 } 273 fGroupCount += dictGroupCount; 274 275 276 277 // Handle input sets that contain the special string {eof}. 278 // Column 1 of the state table is reserved for EOF on input. 279 // Column 2 is reserved for before-the-start-input. 280 // (This column can be optimized away later if there are no rule 281 // references to {bof}.) 282 // Add this column value (1 or 2) to the equivalent expression 283 // subtree for each UnicodeSet that contains the string {eof} 284 // Because {bof} and {eof} are not a characters in the normal sense, 285 // they doesn't affect the computation of ranges or TRIE. 286 287 String eofString = "eof"; 288 String bofString = "bof"; 289 290 for (RBBINode usetNode : fRB.fUSetNodes) { 291 UnicodeSet inputSet = usetNode.fInputSet; 292 if (inputSet.contains(eofString)) { 293 addValToSet(usetNode, 1); 294 } 295 if (inputSet.contains(bofString)) { 296 addValToSet(usetNode, 2); 297 fSawBOF = true; 298 } 299 } 300 301 302 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();} 303 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();} 304 } 305 306 307 private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 255; 308 309 /** 310 * Build the Trie table for mapping UChar32 values to the corresponding 311 * range group number. 312 */ buildTrie()313 void buildTrie() { 314 fTrie = new MutableCodePointTrie(0, // Initial value for all code points. 315 0); // Error value for out-of-range input. 316 317 for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 318 fTrie.setRange(rlRange.fStartChar, // Range start 319 rlRange.fEndChar, // Range end (inclusive) 320 rlRange.fNum // value for range 321 ); 322 } 323 } 324 325 /** 326 * Merge two character categories that have been identified as having equivalent behavior. 327 * The ranges belonging to the second category (table column) will be added to the first. 328 * @param categories the pair of categories to be merged. 329 */ mergeCategories(IntPair categories)330 void mergeCategories(IntPair categories) { 331 assert(categories.first >= 1); 332 assert(categories.second > categories.first); 333 assert((categories.first < fDictCategoriesStart && categories.second < fDictCategoriesStart) || 334 (categories.first >= fDictCategoriesStart && categories.second >= fDictCategoriesStart)); 335 for (RangeDescriptor rd = fRangeList; rd != null; rd = rd.fNext) { 336 int rangeNum = rd.fNum; 337 if (rangeNum == categories.second) { 338 rd.fNum = categories.first; 339 } else if (rangeNum > categories.second) { 340 rd.fNum--; 341 } 342 } 343 --fGroupCount; 344 if (categories.second <= fDictCategoriesStart) { 345 --fDictCategoriesStart; 346 } 347 } 348 349 //----------------------------------------------------------------------------------- 350 // 351 // freezeTrieIfNotYet() Ensure the trie is frozen. Shared code by getTrieSize 352 // and serializeTrie. 353 // 354 //----------------------------------------------------------------------------------- freezeTrieIfNotYet()355 void freezeTrieIfNotYet() { 356 if (fFrozenTrie == null) { 357 boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE; 358 fFrozenTrie = fTrie.buildImmutable(CodePointTrie.Type.FAST, 359 use8Bits ? 360 CodePointTrie.ValueWidth.BITS_8 : 361 CodePointTrie.ValueWidth.BITS_16); 362 fTrie = null; 363 } 364 } 365 366 //----------------------------------------------------------------------------------- 367 // 368 // getTrieSize() Return the size that will be required to serialize the Trie. 369 // 370 //----------------------------------------------------------------------------------- getTrieSize()371 int getTrieSize() { 372 freezeTrieIfNotYet(); 373 return fFrozenTrie.toBinary(new ByteArrayOutputStream()); 374 } 375 376 377 //----------------------------------------------------------------------------------- 378 // 379 // serializeTrie() Write the serialized trie to an output stream 380 // 381 //----------------------------------------------------------------------------------- serializeTrie(OutputStream os)382 void serializeTrie(OutputStream os) throws IOException { 383 freezeTrieIfNotYet(); 384 fFrozenTrie.toBinary(os); 385 } 386 387 //------------------------------------------------------------------------ 388 // 389 // addValToSets Add a runtime-mapped input value to each uset from a 390 // list of uset nodes. (val corresponds to a state table column.) 391 // For each of the original Unicode sets - which correspond 392 // directly to uset nodes - a logically equivalent expression 393 // is constructed in terms of the remapped runtime input 394 // symbol set. This function adds one runtime input symbol to 395 // a list of sets. 396 // 397 // The "logically equivalent expression" is the tree for an 398 // or-ing together of all of the symbols that go into the set. 399 // 400 //------------------------------------------------------------------------ addValToSets(List<RBBINode> sets, int val)401 void addValToSets(List<RBBINode> sets, int val) { 402 for (RBBINode usetNode : sets) { 403 addValToSet(usetNode, val); 404 } 405 } 406 addValToSet(RBBINode usetNode, int val)407 void addValToSet(RBBINode usetNode, int val) { 408 RBBINode leafNode = new RBBINode(RBBINode.leafChar); 409 leafNode.fVal = val; 410 if (usetNode.fLeftChild == null) { 411 usetNode.fLeftChild = leafNode; 412 leafNode.fParent = usetNode; 413 } else { 414 // There are already input symbols present for this set. 415 // Set up an OR node, with the previous stuff as the left child 416 // and the new value as the right child. 417 RBBINode orNode = new RBBINode(RBBINode.opOr); 418 orNode.fLeftChild = usetNode.fLeftChild; 419 orNode.fRightChild = leafNode; 420 orNode.fLeftChild.fParent = orNode; 421 orNode.fRightChild.fParent = orNode; 422 usetNode.fLeftChild = orNode; 423 orNode.fParent = usetNode; 424 } 425 } 426 427 428 //------------------------------------------------------------------------ 429 // 430 // getNumCharCategories 431 // 432 //------------------------------------------------------------------------ getNumCharCategories()433 int getNumCharCategories() { 434 return fGroupCount + 3; 435 } 436 437 438 //------------------------------------------------------------------------ 439 // 440 // getDictCategoriesStart 441 // 442 //------------------------------------------------------------------------ getDictCategoriesStart()443 int getDictCategoriesStart() { 444 return fDictCategoriesStart; 445 } 446 447 448 //------------------------------------------------------------------------ 449 // 450 // sawBOF 451 // 452 //------------------------------------------------------------------------ sawBOF()453 boolean sawBOF() { 454 return fSawBOF; 455 } 456 457 458 //------------------------------------------------------------------------ 459 // 460 // getFirstChar Given a runtime RBBI character category, find 461 // the first UChar32 that is in the set of chars 462 // in the category. 463 //------------------------------------------------------------------------ getFirstChar(int category)464 int getFirstChar(int category) { 465 RangeDescriptor rlRange; 466 int retVal = -1; 467 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 468 if (rlRange.fNum == category) { 469 retVal = rlRange.fStartChar; 470 break; 471 } 472 } 473 return retVal; 474 } 475 476 477 //------------------------------------------------------------------------ 478 // 479 // printRanges A debugging function. 480 // dump out all of the range definitions. 481 // 482 //------------------------------------------------------------------------ 483 ///CLOVER:OFF printRanges()484 void printRanges() { 485 RangeDescriptor rlRange; 486 int i; 487 488 System.out.print("\n\n Nonoverlapping Ranges ...\n"); 489 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 490 System.out.printf("%04x-%04x ", rlRange.fStartChar, rlRange.fEndChar); 491 492 for (i=0; i<rlRange.fIncludesSets.size(); i++) { 493 RBBINode usetNode = rlRange.fIncludesSets.get(i); 494 String setName = "anon"; 495 RBBINode setRef = usetNode.fParent; 496 if (setRef != null) { 497 RBBINode varRef = setRef.fParent; 498 if (varRef != null && varRef.fType == RBBINode.varRef) { 499 setName = varRef.fText; 500 } 501 } 502 System.out.print(setName); System.out.print(" "); 503 } 504 System.out.println(""); 505 } 506 } 507 ///CLOVER:ON 508 509 510 //------------------------------------------------------------------------ 511 // 512 // printRangeGroups A debugging function. 513 // dump out all of the range groups. 514 // 515 //------------------------------------------------------------------------ 516 ///CLOVER:OFF printRangeGroups()517 void printRangeGroups() { 518 int i; 519 520 System.out.print("\nRanges grouped by Unicode Set Membership...\n"); 521 for (RangeDescriptor rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 522 if (rlRange.fFirstInGroup) { 523 int groupNum = rlRange.fNum; 524 if (groupNum<10) {System.out.print(" ");} 525 System.out.print(groupNum + " "); 526 527 if (groupNum >= fDictCategoriesStart) { System.out.print(" <DICT> ");} 528 529 for (i=0; i<rlRange.fIncludesSets.size(); i++) { 530 RBBINode usetNode = rlRange.fIncludesSets.get(i); 531 String setName = "anon"; 532 RBBINode setRef = usetNode.fParent; 533 if (setRef != null) { 534 RBBINode varRef = setRef.fParent; 535 if (varRef != null && varRef.fType == RBBINode.varRef) { 536 setName = varRef.fText; 537 } 538 } 539 System.out.print(setName); System.out.print(" "); 540 } 541 542 i = 0; 543 for (RangeDescriptor tRange = rlRange; tRange != null; tRange = tRange.fNext) { 544 if (tRange.fNum == rlRange.fNum) { 545 if (i++ % 5 == 0) { 546 System.out.print("\n "); 547 } 548 RBBINode.printHex(tRange.fStartChar, -1); 549 System.out.print("-"); 550 RBBINode.printHex(tRange.fEndChar, 0); 551 } 552 } 553 System.out.print("\n"); 554 } 555 } 556 System.out.print("\n"); 557 } 558 ///CLOVER:ON 559 560 561 //------------------------------------------------------------------------ 562 // 563 // printSets A debugging function. 564 // dump out all of the set definitions. 565 // 566 //------------------------------------------------------------------------ 567 ///CLOVER:OFF printSets()568 void printSets() { 569 int i; 570 System.out.print("\n\nUnicode Sets List\n------------------\n"); 571 for (i=0; i<fRB.fUSetNodes.size(); i++) { 572 RBBINode usetNode; 573 RBBINode setRef; 574 RBBINode varRef; 575 String setName; 576 577 usetNode = fRB.fUSetNodes.get(i); 578 579 //System.out.print(" " + i + " "); 580 RBBINode.printInt(2, i); 581 setName = "anonymous"; 582 setRef = usetNode.fParent; 583 if (setRef != null) { 584 varRef = setRef.fParent; 585 if (varRef != null && varRef.fType == RBBINode.varRef) { 586 setName = varRef.fText; 587 } 588 } 589 System.out.print(" " + setName); 590 System.out.print(" "); 591 System.out.print(usetNode.fText); 592 System.out.print("\n"); 593 if (usetNode.fLeftChild != null) { 594 usetNode.fLeftChild.printTree(true); 595 } 596 } 597 System.out.print("\n"); 598 } 599 ///CLOVER:ON 600 } 601