1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 /* 3 ******************************************************************************* 4 * Copyright (C) 2003-2011, International Business Machines Corporation and * 5 * others. All Rights Reserved. * 6 ******************************************************************************* 7 */ 8 package android.icu.text; 9 10 import java.io.IOException; 11 import java.io.OutputStream; 12 import java.util.ArrayList; 13 import java.util.List; 14 15 import android.icu.impl.Assert; 16 import android.icu.impl.IntTrieBuilder; 17 18 // 19 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules 20 // (part of the rule building process.) 21 // 22 // Starting with the rules parse tree from the scanner, 23 // 24 // - Enumerate the set of UnicodeSets that are referenced 25 // by the RBBI rules. 26 // - compute a set of non-overlapping character ranges 27 // with all characters within a range belonging to the same 28 // set of input uniocde sets. 29 // - Derive a set of non-overlapping UnicodeSet (like things) 30 // that will correspond to columns in the state table for 31 // the RBBI execution engine. All characters within one 32 // of these sets belong to the same set of the original 33 // UnicodeSets from the user's rules. 34 // - construct the trie table that maps input characters 35 // to the index of the matching non-overlapping set of set from 36 // the previous step. 37 // 38 class RBBISetBuilder { 39 static class RangeDescriptor { 40 int fStartChar; // Start of range, unicode 32 bit value. 41 int fEndChar; // End of range, unicode 32 bit value. 42 int fNum; // runtime-mapped input value for this range. 43 List<RBBINode> fIncludesSets; // vector of the the original 44 // Unicode sets that include this range. 45 // (Contains ptrs to uset nodes) 46 RangeDescriptor fNext; // Next RangeDescriptor in the linked list. 47 RangeDescriptor()48 RangeDescriptor() { 49 fIncludesSets = new ArrayList<RBBINode>(); 50 } 51 RangeDescriptor(RangeDescriptor other)52 RangeDescriptor(RangeDescriptor other) { 53 fStartChar = other.fStartChar; 54 fEndChar = other.fEndChar; 55 fNum = other.fNum; 56 fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets); 57 } 58 59 //------------------------------------------------------------------------------------- 60 // 61 // RangeDesriptor::split() 62 // 63 //------------------------------------------------------------------------------------- split(int where)64 void split(int where) { 65 Assert.assrt(where>fStartChar && where<=fEndChar); 66 RangeDescriptor nr = new RangeDescriptor(this); 67 68 // RangeDescriptor copy constructor copies all fields. 69 // Only need to update those that are different after the split. 70 nr.fStartChar = where; 71 this.fEndChar = where-1; 72 nr.fNext = this.fNext; 73 this.fNext = nr; 74 75 // TODO: fIncludesSets is not updated. Check it out. 76 // Probably because they haven't been populated yet, 77 // but still sloppy. 78 } 79 80 81 //------------------------------------------------------------------------------------- 82 // 83 // RangeDescriptor::setDictionaryFlag 84 // 85 // Character Category Numbers that include characters from 86 // the original Unicode Set named "dictionary" have bit 14 87 // set to 1. The RBBI runtime engine uses this to trigger 88 // use of the word dictionary. 89 // 90 // This function looks through the Unicode Sets that it 91 // (the range) includes, and sets the bit in fNum when 92 // "dictionary" is among them. 93 // 94 // TODO: a faster way would be to find the set node for 95 // "dictionary" just once, rather than looking it 96 // up by name every time. 97 // 98 // ------------------------------------------------------------------------------------- setDictionaryFlag()99 void setDictionaryFlag() { 100 int i; 101 102 for (i=0; i<this.fIncludesSets.size(); i++) { 103 RBBINode usetNode = fIncludesSets.get(i); 104 String setName = ""; 105 RBBINode setRef = usetNode.fParent; 106 if (setRef != null) { 107 RBBINode varRef = setRef.fParent; 108 if (varRef != null && varRef.fType == RBBINode.varRef) { 109 setName = varRef.fText; 110 } 111 } 112 if (setName.equals("dictionary")) { 113 this.fNum |= 0x4000; 114 break; 115 } 116 } 117 118 } 119 } 120 121 122 RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us. 123 RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors 124 125 IntTrieBuilder fTrie; // The mapping TRIE that is the end result of processing 126 int fTrieSize; // the Unicode Sets. 127 128 // Groups correspond to character categories - 129 // groups of ranges that are in the same original UnicodeSets. 130 // fGroupCount is the index of the last used group. 131 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. 132 // State table column 0 is not used. Column 1 is for end-of-input. 133 // column 2 is for group 0. Funny counting. 134 int fGroupCount; 135 136 boolean fSawBOF; 137 138 139 //------------------------------------------------------------------------ 140 // 141 // RBBISetBuilder Constructor 142 // 143 //------------------------------------------------------------------------ RBBISetBuilder(RBBIRuleBuilder rb)144 RBBISetBuilder(RBBIRuleBuilder rb) 145 { 146 fRB = rb; 147 } 148 149 150 //------------------------------------------------------------------------ 151 // 152 // build Build the list of non-overlapping character ranges 153 // from the Unicode Sets. 154 // 155 //------------------------------------------------------------------------ build()156 void build() { 157 RangeDescriptor rlRange; 158 159 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();} 160 161 // Initialize the process by creating a single range encompassing all characters 162 // that is in no sets. 163 // 164 fRangeList = new RangeDescriptor(); 165 fRangeList.fStartChar = 0; 166 fRangeList.fEndChar = 0x10ffff; 167 168 // 169 // Find the set of non-overlapping ranges of characters 170 // 171 for (RBBINode usetNode : fRB.fUSetNodes) { 172 UnicodeSet inputSet = usetNode.fInputSet; 173 int inputSetRangeCount = inputSet.getRangeCount(); 174 int inputSetRangeIndex = 0; 175 rlRange = fRangeList; 176 177 for (;;) { 178 if (inputSetRangeIndex >= inputSetRangeCount) { 179 break; 180 } 181 int inputSetRangeBegin = inputSet.getRangeStart(inputSetRangeIndex); 182 int inputSetRangeEnd = inputSet.getRangeEnd(inputSetRangeIndex); 183 184 // skip over ranges from the range list that are completely 185 // below the current range from the input unicode set. 186 while (rlRange.fEndChar < inputSetRangeBegin) { 187 rlRange = rlRange.fNext; 188 } 189 190 // If the start of the range from the range list is before with 191 // the start of the range from the unicode set, split the range list range 192 // in two, with one part being before (wholly outside of) the unicode set 193 // and the other containing the rest. 194 // Then continue the loop; the post-split current range will then be skipped 195 // over 196 if (rlRange.fStartChar < inputSetRangeBegin) { 197 rlRange.split(inputSetRangeBegin); 198 continue; 199 } 200 201 // Same thing at the end of the ranges... 202 // If the end of the range from the range list doesn't coincide with 203 // the end of the range from the unicode set, split the range list 204 // range in two. The first part of the split range will be 205 // wholly inside the Unicode set. 206 if (rlRange.fEndChar > inputSetRangeEnd) { 207 rlRange.split(inputSetRangeEnd+1); 208 } 209 210 // The current rlRange is now entirely within the UnicodeSet range. 211 // Add this unicode set to the list of sets for this rlRange 212 if (rlRange.fIncludesSets.indexOf(usetNode) == -1) { 213 rlRange.fIncludesSets.add(usetNode); 214 } 215 216 // Advance over ranges that we are finished with. 217 if (inputSetRangeEnd == rlRange.fEndChar) { 218 inputSetRangeIndex++; 219 } 220 rlRange = rlRange.fNext; 221 } 222 } 223 224 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("range")>=0) { printRanges();} 225 226 // 227 // Group the above ranges, with each group consisting of one or more 228 // ranges that are in exactly the same set of original UnicodeSets. 229 // The groups are numbered, and these group numbers are the set of 230 // input symbols recognized by the run-time state machine. 231 // 232 // Numbering: # 0 (state table column 0) is unused. 233 // # 1 is reserved - table column 1 is for end-of-input 234 // # 2 is reserved - table column 2 is for beginning-in-input 235 // # 3 is the first range list. 236 // 237 RangeDescriptor rlSearchRange; 238 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 239 for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) { 240 if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) { 241 rlRange.fNum = rlSearchRange.fNum; 242 break; 243 } 244 } 245 if (rlRange.fNum == 0) { 246 fGroupCount ++; 247 rlRange.fNum = fGroupCount+2; 248 rlRange.setDictionaryFlag(); 249 addValToSets(rlRange.fIncludesSets, fGroupCount+2); 250 } 251 } 252 253 // Handle input sets that contain the special string {eof}. 254 // Column 1 of the state table is reserved for EOF on input. 255 // Column 2 is reserved for before-the-start-input. 256 // (This column can be optimized away later if there are no rule 257 // references to {bof}.) 258 // Add this column value (1 or 2) to the equivalent expression 259 // subtree for each UnicodeSet that contains the string {eof} 260 // Because {bof} and {eof} are not a characters in the normal sense, 261 // they doesn't affect the computation of ranges or TRIE. 262 263 String eofString = "eof"; 264 String bofString = "bof"; 265 266 for (RBBINode usetNode : fRB.fUSetNodes) { 267 UnicodeSet inputSet = usetNode.fInputSet; 268 if (inputSet.contains(eofString)) { 269 addValToSet(usetNode, 1); 270 } 271 if (inputSet.contains(bofString)) { 272 addValToSet(usetNode, 2); 273 fSawBOF = true; 274 } 275 } 276 277 278 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();} 279 if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();} 280 281 282 //IntTrieBuilder(int aliasdata[], int maxdatalength, 283 // int initialvalue, int leadunitvalue, 284 // boolean latin1linear) 285 286 fTrie = new IntTrieBuilder(null, // Data array (utrie will allocate one) 287 100000, // Max Data Length 288 0, // Initial value for all code points 289 0, // Lead Surrogate unit value, 290 true); // Keep Latin 1 in separately. 291 292 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 293 fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar+1, rlRange.fNum, true); 294 } 295 } 296 297 298 299 //----------------------------------------------------------------------------------- 300 // 301 // RBBIDataManipulate A little internal class needed only to wrap of the 302 // getFoldedValue() function needed for Trie table creation. 303 // 304 //----------------------------------------------------------------------------------- 305 class RBBIDataManipulate implements IntTrieBuilder.DataManipulate { getFoldedValue(int start, int offset)306 public int getFoldedValue(int start, int offset) { 307 int value; 308 int limit; 309 boolean [] inBlockZero = new boolean[1]; 310 311 limit = start + 0x400; 312 while(start<limit) { 313 value = fTrie.getValue(start, inBlockZero); 314 if (inBlockZero[0]) { 315 start += IntTrieBuilder.DATA_BLOCK_LENGTH; 316 } else if (value != 0) { 317 return offset | 0x08000; 318 } else { 319 ++start; 320 } 321 } 322 return 0; 323 } 324 } 325 RBBIDataManipulate dm = new RBBIDataManipulate(); 326 327 //----------------------------------------------------------------------------------- 328 // 329 // getTrieSize() Return the size that will be required to serialize the Trie. 330 // 331 //----------------------------------------------------------------------------------- getTrieSize()332 int getTrieSize() { 333 int size = 0; 334 try { 335 // The trie serialize function returns the size of the data written. 336 // null output stream says give size only, don't actually write anything. 337 size = fTrie.serialize(null, true, dm ); 338 } catch (IOException e) { 339 Assert.assrt (false); 340 } 341 return size; 342 } 343 344 345 //----------------------------------------------------------------------------------- 346 // 347 // serializeTrie() Write the serialized trie to an output stream 348 // 349 //----------------------------------------------------------------------------------- serializeTrie(OutputStream os)350 void serializeTrie(OutputStream os) throws IOException { 351 fTrie.serialize(os, true, dm ); 352 } 353 354 //------------------------------------------------------------------------ 355 // 356 // addValToSets Add a runtime-mapped input value to each uset from a 357 // list of uset nodes. (val corresponds to a state table column.) 358 // For each of the original Unicode sets - which correspond 359 // directly to uset nodes - a logically equivalent expression 360 // is constructed in terms of the remapped runtime input 361 // symbol set. This function adds one runtime input symbol to 362 // a list of sets. 363 // 364 // The "logically equivalent expression" is the tree for an 365 // or-ing together of all of the symbols that go into the set. 366 // 367 //------------------------------------------------------------------------ addValToSets(List<RBBINode> sets, int val)368 void addValToSets(List<RBBINode> sets, int val) { 369 for (RBBINode usetNode : sets) { 370 addValToSet(usetNode, val); 371 } 372 } 373 addValToSet(RBBINode usetNode, int val)374 void addValToSet(RBBINode usetNode, int val) { 375 RBBINode leafNode = new RBBINode(RBBINode.leafChar); 376 leafNode.fVal = val; 377 if (usetNode.fLeftChild == null) { 378 usetNode.fLeftChild = leafNode; 379 leafNode.fParent = usetNode; 380 } else { 381 // There are already input symbols present for this set. 382 // Set up an OR node, with the previous stuff as the left child 383 // and the new value as the right child. 384 RBBINode orNode = new RBBINode(RBBINode.opOr); 385 orNode.fLeftChild = usetNode.fLeftChild; 386 orNode.fRightChild = leafNode; 387 orNode.fLeftChild.fParent = orNode; 388 orNode.fRightChild.fParent = orNode; 389 usetNode.fLeftChild = orNode; 390 orNode.fParent = usetNode; 391 } 392 } 393 394 395 //------------------------------------------------------------------------ 396 // 397 // getNumCharCategories 398 // 399 //------------------------------------------------------------------------ getNumCharCategories()400 int getNumCharCategories() { 401 return fGroupCount + 3; 402 } 403 404 405 //------------------------------------------------------------------------ 406 // 407 // sawBOF 408 // 409 //------------------------------------------------------------------------ sawBOF()410 boolean sawBOF() { 411 return fSawBOF; 412 } 413 414 415 //------------------------------------------------------------------------ 416 // 417 // getFirstChar Given a runtime RBBI character category, find 418 // the first UChar32 that is in the set of chars 419 // in the category. 420 //------------------------------------------------------------------------ getFirstChar(int category)421 int getFirstChar(int category) { 422 RangeDescriptor rlRange; 423 int retVal = -1; 424 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 425 if (rlRange.fNum == category) { 426 retVal = rlRange.fStartChar; 427 break; 428 } 429 } 430 return retVal; 431 } 432 433 434 435 //------------------------------------------------------------------------ 436 // 437 // printRanges A debugging function. 438 // dump out all of the range definitions. 439 // 440 //------------------------------------------------------------------------ 441 ///CLOVER:OFF printRanges()442 void printRanges() { 443 RangeDescriptor rlRange; 444 int i; 445 446 System.out.print("\n\n Nonoverlapping Ranges ...\n"); 447 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 448 System.out.print(" " + rlRange.fNum + " " + rlRange.fStartChar + "-" + rlRange.fEndChar); 449 450 for (i=0; i<rlRange.fIncludesSets.size(); i++) { 451 RBBINode usetNode = rlRange.fIncludesSets.get(i); 452 String setName = "anon"; 453 RBBINode setRef = usetNode.fParent; 454 if (setRef != null) { 455 RBBINode varRef = setRef.fParent; 456 if (varRef != null && varRef.fType == RBBINode.varRef) { 457 setName = varRef.fText; 458 } 459 } 460 System.out.print(setName); System.out.print(" "); 461 } 462 System.out.println(""); 463 } 464 } 465 ///CLOVER:ON 466 467 468 //------------------------------------------------------------------------ 469 // 470 // printRangeGroups A debugging function. 471 // dump out all of the range groups. 472 // 473 //------------------------------------------------------------------------ 474 ///CLOVER:OFF printRangeGroups()475 void printRangeGroups() { 476 RangeDescriptor rlRange; 477 RangeDescriptor tRange; 478 int i; 479 int lastPrintedGroupNum = 0; 480 481 System.out.print("\nRanges grouped by Unicode Set Membership...\n"); 482 for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { 483 int groupNum = rlRange.fNum & 0xbfff; 484 if (groupNum > lastPrintedGroupNum) { 485 lastPrintedGroupNum = groupNum; 486 if (groupNum<10) {System.out.print(" ");} 487 System.out.print(groupNum + " "); 488 489 if ((rlRange.fNum & 0x4000) != 0) { System.out.print(" <DICT> ");} 490 491 for (i=0; i<rlRange.fIncludesSets.size(); i++) { 492 RBBINode usetNode = rlRange.fIncludesSets.get(i); 493 String setName = "anon"; 494 RBBINode setRef = usetNode.fParent; 495 if (setRef != null) { 496 RBBINode varRef = setRef.fParent; 497 if (varRef != null && varRef.fType == RBBINode.varRef) { 498 setName = varRef.fText; 499 } 500 } 501 System.out.print(setName); System.out.print(" "); 502 } 503 504 i = 0; 505 for (tRange = rlRange; tRange != null; tRange = tRange.fNext) { 506 if (tRange.fNum == rlRange.fNum) { 507 if (i++ % 5 == 0) { 508 System.out.print("\n "); 509 } 510 RBBINode.printHex(tRange.fStartChar, -1); 511 System.out.print("-"); 512 RBBINode.printHex(tRange.fEndChar, 0); 513 } 514 } 515 System.out.print("\n"); 516 } 517 } 518 System.out.print("\n"); 519 } 520 ///CLOVER:ON 521 522 523 //------------------------------------------------------------------------ 524 // 525 // printSets A debugging function. 526 // dump out all of the set definitions. 527 // 528 //------------------------------------------------------------------------ 529 ///CLOVER:OFF printSets()530 void printSets() { 531 int i; 532 System.out.print("\n\nUnicode Sets List\n------------------\n"); 533 for (i=0; i<fRB.fUSetNodes.size(); i++) { 534 RBBINode usetNode; 535 RBBINode setRef; 536 RBBINode varRef; 537 String setName; 538 539 usetNode = fRB.fUSetNodes.get(i); 540 541 //System.out.print(" " + i + " "); 542 RBBINode.printInt(2, i); 543 setName = "anonymous"; 544 setRef = usetNode.fParent; 545 if (setRef != null) { 546 varRef = setRef.fParent; 547 if (varRef != null && varRef.fType == RBBINode.varRef) { 548 setName = varRef.fText; 549 } 550 } 551 System.out.print(" " + setName); 552 System.out.print(" "); 553 System.out.print(usetNode.fText); 554 System.out.print("\n"); 555 if (usetNode.fLeftChild != null) { 556 usetNode.fLeftChild.printTree(true); 557 } 558 } 559 System.out.print("\n"); 560 } 561 ///CLOVER:ON 562 } 563