// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 1996-2015, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.text;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

import com.ibm.icu.impl.CharTrie;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUBinary.Authenticate;
import com.ibm.icu.impl.Trie;

/**
 * <p>Internal class used for Rule Based Break Iterators.</p>
 * <p>This class provides access to the compiled break rule data, as
 * it is stored in a .brk file.</p>
 */
final class RBBIDataWrapper {
    //
    // These fields are the ready-to-use compiled rule data, as
    // read from the file.
    //
    RBBIDataHeader fHeader;
    short[]        fFTable;      // forward state table
    short[]        fRTable;      // reverse state table
    short[]        fSFTable;     // safe forward state table; stays null when its length is 0
    short[]        fSRTable;     // safe reverse state table; stays null when its length is 0
    CharTrie       fTrie;
    String         fRuleSource;
    int[]          fStatusTable;

    // True when the buffer passed to get() was big-endian after its ICU header was read.
    // Needed to reassemble 32-bit state table header fields from pairs of shorts.
    private boolean isBigEndian;

    static final int DATA_FORMAT = 0x42726b20;     // "Brk "
    static final int FORMAT_VERSION = 0x03010000;  // 3.1

    /** Accepts data whose major format version matches {@link #FORMAT_VERSION}. */
    private static final class IsAcceptable implements Authenticate {
        @Override
        public boolean isDataVersionAcceptable(byte[] version) {
            return version[0] == (FORMAT_VERSION >>> 24);
        }
    }
    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();

    //
    // Indexes to fields in the ICU4C style binary form of the RBBI Data Header
    // Used by the rule compiler when flattening the data.
    //
    static final int DH_SIZE           = 24;
    static final int DH_MAGIC          = 0;
    static final int DH_FORMATVERSION  = 1;
    static final int DH_LENGTH         = 2;
    static final int DH_CATCOUNT       = 3;
    static final int DH_FTABLE         = 4;
    static final int DH_FTABLELEN      = 5;
    static final int DH_RTABLE         = 6;
    static final int DH_RTABLELEN      = 7;
    static final int DH_SFTABLE        = 8;
    static final int DH_SFTABLELEN     = 9;
    static final int DH_SRTABLE        = 10;
    static final int DH_SRTABLELEN     = 11;
    static final int DH_TRIE           = 12;
    static final int DH_TRIELEN        = 13;
    static final int DH_RULESOURCE     = 14;
    static final int DH_RULESOURCELEN  = 15;
    static final int DH_STATUSTABLE    = 16;
    static final int DH_STATUSTABLELEN = 17;


    // Index offsets to the fields in a state table row.
    //    Corresponds to struct RBBIStateTableRow in the C version.
    //
    static final int ACCEPTING  = 0;
    static final int LOOKAHEAD  = 1;
    static final int TAGIDX     = 2;
    static final int RESERVED   = 3;
    static final int NEXTSTATES = 4;

    // Index offsets to header fields of a state table
    //     struct RBBIStateTable {...   in the C version.
    //
    static final int NUMSTATES  = 0;
    static final int ROWLEN     = 2;
    static final int FLAGS      = 4;
    // private static final int RESERVED_2 = 6;
    private static final int ROW_DATA = 8;

    // Bit selectors for the "FLAGS" field of the state table header
    //     enum RBBIStateTableFlags in the C version.
    //
    static final int RBBI_LOOKAHEAD_HARD_BREAK = 1;
    static final int RBBI_BOF_REQUIRED         = 2;

    /**
     * Data Header.  A struct-like class with the fields from the RBBI data file header.
     */
    static final class RBBIDataHeader {
        int     fMagic;          //  == 0xb1a0
        int     fVersion;        //  == 1 (for ICU 3.2 and earlier).
        byte[]  fFormatVersion;  //  For ICU 3.4 and later.
        int     fLength;         //  Total length in bytes of this RBBI Data,
                                 //      including all sections, not just the header.
        int     fCatCount;       //  Number of character categories.

        //
        //  Offsets and sizes of each of the subsections within the RBBI data.
        //  All offsets are bytes from the start of the RBBIDataHeader.
        //  All sizes are in bytes.
        //
        int     fFTable;         //  forward state transition table.
        int     fFTableLen;
        int     fRTable;         //  Offset to the reverse state transition table.
        int     fRTableLen;
        int     fSFTable;        //  safe point forward transition table
        int     fSFTableLen;
        int     fSRTable;        //  safe point reverse transition table
        int     fSRTableLen;
        int     fTrie;           //  Offset to Trie data for character categories
        int     fTrieLen;
        int     fRuleSource;     //  Offset to the source for the break
        int     fRuleSourceLen;  //      rules.  Stored UChar *.
        int     fStatusTable;    //  Offset to the table of rule status values
        int     fStatusTableLen;

        public RBBIDataHeader() {
            fMagic = 0;
            fFormatVersion = new byte[4];
        }
    }


    /**
     * RBBI State Table Indexing Function.  Given a state number, return the
     * array index of the start of the state table row for that state.
     *
     * @param state the state number
     * @return index into a state table (array of shorts) of the first field of the row
     */
    int getRowIndex(int state) {
        // Each row holds NEXTSTATES fixed fields followed by one next-state per category.
        return ROW_DATA + state * (fHeader.fCatCount + NEXTSTATES);
    }

    /** Folding function handed to the {@link CharTrie} that holds the character categories. */
    static class TrieFoldingFunc implements Trie.DataManipulate {
        @Override
        public int getFoldingOffset(int data) {
            // Bit 15 flags that the low 15 bits carry a folding offset.
            return (data & 0x8000) != 0 ? data & 0x7fff : 0;
        }
    }
    static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();


    RBBIDataWrapper() {
    }

    /**
     * Get an RBBIDataWrapper from a ByteBuffer holding a pre-compiled set
     * of RBBI rules.
     *
     * @param bytes buffer positioned at the start of the ICU data header of the .brk data
     * @return a wrapper holding the deserialized tables, trie, status values and rule source
     * @throws IOException if the magic number or format version is not recognized, or if
     *         section offsets in the RBBI header are inconsistent
     */
    static RBBIDataWrapper get(ByteBuffer bytes) throws IOException {
        RBBIDataWrapper wrapper = new RBBIDataWrapper();

        ICUBinary.readHeader(bytes, DATA_FORMAT, IS_ACCEPTABLE);
        wrapper.isBigEndian = bytes.order() == ByteOrder.BIG_ENDIAN;

        // Read in the RBBI data header...
        RBBIDataHeader header = new RBBIDataHeader();
        wrapper.fHeader = header;
        header.fMagic = bytes.getInt();
        // Read the same 4 bytes as an int and as a byte array: The data format could be
        // the old fVersion=1 (TODO: probably not with a real ICU data header?)
        // or the new fFormatVersion=3.x.
        header.fVersion = bytes.getInt(bytes.position());   // absolute get: does not advance
        header.fFormatVersion[0] = bytes.get();
        header.fFormatVersion[1] = bytes.get();
        header.fFormatVersion[2] = bytes.get();
        header.fFormatVersion[3] = bytes.get();
        header.fLength         = bytes.getInt();
        header.fCatCount       = bytes.getInt();
        header.fFTable         = bytes.getInt();
        header.fFTableLen      = bytes.getInt();
        header.fRTable         = bytes.getInt();
        header.fRTableLen      = bytes.getInt();
        header.fSFTable        = bytes.getInt();
        header.fSFTableLen     = bytes.getInt();
        header.fSRTable        = bytes.getInt();
        header.fSRTableLen     = bytes.getInt();
        header.fTrie           = bytes.getInt();
        header.fTrieLen        = bytes.getInt();
        header.fRuleSource     = bytes.getInt();
        header.fRuleSourceLen  = bytes.getInt();
        header.fStatusTable    = bytes.getInt();
        header.fStatusTableLen = bytes.getInt();
        ICUBinary.skipBytes(bytes, 6 * 4);    // uint32_t  fReserved[6];

        boolean versionSupported =
                header.fVersion == 1 ||             // ICU 3.2 and earlier
                header.fFormatVersion[0] == 3;      // ICU 3.4
        if (header.fMagic != 0xb1a0 || !versionSupported) {
            throw new IOException("Break Iterator Rule Data Magic Number Incorrect, or unsupported data version.");
        }

        // Current position in the buffer, relative to the start of the RBBI header.
        // The header has DH_SIZE fields, all int32_t (4 bytes).
        int pos = DH_SIZE * 4;

        //
        // Read in the Forward state transition table as an array of shorts.
        //

        //   Quick Sanity Check
        if (header.fFTable < pos || header.fFTable > header.fLength) {
            throw new IOException("Break iterator Rule data corrupt");
        }
        wrapper.fFTable = readShorts(bytes, pos, header.fFTable, header.fFTableLen);
        pos = header.fFTable + header.fFTableLen;

        //
        // Read in the Reverse state table
        //
        wrapper.fRTable = readShorts(bytes, pos, header.fRTable, header.fRTableLen);
        pos = header.fRTable + header.fRTableLen;

        //
        // Read in the Safe Forward state table (optional)
        //
        if (header.fSFTableLen > 0) {
            wrapper.fSFTable = readShorts(bytes, pos, header.fSFTable, header.fSFTableLen);
            pos = header.fSFTable + header.fSFTableLen;
        }

        //
        // Read in the Safe Reverse state table (optional)
        //
        if (header.fSRTableLen > 0) {
            wrapper.fSRTable = readShorts(bytes, pos, header.fSRTable, header.fSRTableLen);
            pos = header.fSRTable + header.fSRTableLen;
        }

        //
        // Unserialize the Character categories TRIE
        //     Because we can't be absolutely certain where the Trie deserialize will
        //     leave the buffer, leave position unchanged.
        //     The seek to the start of the next item following the TRIE will get us
        //     back in sync.
        //
        ICUBinary.skipBytes(bytes, header.fTrie - pos);   // seek buffer from end of previous
        pos = header.fTrie;                               //   section to the start of the trie

        bytes.mark();                   // Remember the position of the start of the TRIE.

        wrapper.fTrie = new CharTrie(bytes, fTrieFoldingFunc);
                                        // Deserialize the TRIE, leaving buffer
                                        //   at an unknown position, preceding the
                                        //   padding between TRIE and following section.

        bytes.reset();                  // Move buffer back to marked position at
                                        //   the start of the serialized TRIE.  Now our
                                        //   "pos" variable and the buffer are in
                                        //   agreement.

        //
        // Read the Rule Status Table
        //
        if (pos > header.fStatusTable) {
            throw new IOException("Break iterator Rule data corrupt");
        }
        ICUBinary.skipBytes(bytes, header.fStatusTable - pos);
        wrapper.fStatusTable = ICUBinary.getInts(
                bytes, header.fStatusTableLen / 4, header.fStatusTableLen & 3);
        pos = header.fStatusTable + header.fStatusTableLen;

        //
        // Put the break rule source into a String
        //
        if (pos > header.fRuleSource) {
            throw new IOException("Break iterator Rule data corrupt");
        }
        ICUBinary.skipBytes(bytes, header.fRuleSource - pos);
        wrapper.fRuleSource = ICUBinary.getString(
                bytes, header.fRuleSourceLen / 2, header.fRuleSourceLen & 1);

        if (RuleBasedBreakIterator.fDebugEnv != null
                && RuleBasedBreakIterator.fDebugEnv.indexOf("data") >= 0) {
            wrapper.dump(System.out);
        }
        return wrapper;
    }

    /**
     * Seeks from {@code pos} to {@code offset} (skipping any padding) and reads a section
     * of {@code lengthInBytes} bytes as an array of shorts.
     *
     * @param bytes         the data buffer
     * @param pos           current buffer position, relative to the start of the RBBI header
     * @param offset        section start, relative to the start of the RBBI header
     * @param lengthInBytes section size in bytes
     * @return the section contents
     */
    private static short[] readShorts(ByteBuffer bytes, int pos, int offset, int lengthInBytes) {
        ICUBinary.skipBytes(bytes, offset - pos);
        // The trailing odd byte, if any, is passed on as the count of additional bytes.
        return ICUBinary.getShorts(bytes, lengthInBytes / 2, lengthInBytes & 1);
    }

    ///CLOVER:OFF
    //  Getters for fields from the state table header
    //
    private int getStateTableNumStates(short[] table) {
        // The 32-bit count is stored as two consecutive shorts; which one is the
        // high half depends on the byte order the data was read with.
        if (isBigEndian) {
            return (table[NUMSTATES] << 16) | (table[NUMSTATES + 1] & 0xffff);
        } else {
            return (table[NUMSTATES + 1] << 16) | (table[NUMSTATES] & 0xffff);
        }
    }
    ///CLOVER:ON

    /**
     * Returns the FLAGS field of a state table header.
     *
     * @param table a state table as read by {@link #get(ByteBuffer)}
     * @return the flag bits (see RBBI_LOOKAHEAD_HARD_BREAK, RBBI_BOF_REQUIRED)
     */
    int getStateTableFlags(short[] table) {
        // This works for up to 15 flags bits: only the low-order short is read.
        return table[isBigEndian ? FLAGS + 1 : FLAGS];
    }

    ///CLOVER:OFF
    /* Debug function to display the break iterator data. */
    void dump(java.io.PrintStream out) {
        if (fFTable.length == 0) {
            // There is no table. Fail early for testing purposes.
            throw new NullPointerException();
        }
        out.println("RBBI Data Wrapper dump ...");
        out.println();
        out.println("Forward State Table");
        dumpTable(out, fFTable);
        out.println("Reverse State Table");
        dumpTable(out, fRTable);
        out.println("Forward Safe Points Table");
        dumpTable(out, fSFTable);
        out.println("Reverse Safe Points Table");
        dumpTable(out, fSRTable);

        dumpCharCategories(out);
        out.println("Source Rules: " + fRuleSource);

    }
    ///CLOVER:ON

    ///CLOVER:OFF
    /* Fixed width int-to-string conversion: decimal, left-padded with spaces. */
    public static String intToString(int n, int width) {
        StringBuilder dest = new StringBuilder(width);
        dest.append(n);
        while (dest.length() < width) {
            dest.insert(0, ' ');
        }
        return dest.toString();
    }
    ///CLOVER:ON

    ///CLOVER:OFF
    /* Fixed width int-to-string conversion: hexadecimal, left-padded with spaces. */
    public static String intToHexString(int n, int width) {
        StringBuilder dest = new StringBuilder(width);
        dest.append(Integer.toHexString(n));
        while (dest.length() < width) {
            dest.insert(0, ' ');
        }
        return dest.toString();
    }
    ///CLOVER:ON

    ///CLOVER:OFF
    /** Dump a state table.  (A full set of RBBI rules has 4 state tables.) */
    private void dumpTable(java.io.PrintStream out, short[] table) {
        if (table == null) {
            out.println(" -- null -- ");
            return;
        }
        // Labels are right-aligned to the widths dumpRow() uses: 4 for the row number,
        // 5 for every other column.
        StringBuilder header = new StringBuilder(" Row  Acc Look  Tag");
        for (int n = 0; n < fHeader.fCatCount; n++) {
            header.append(intToString(n, 5));
        }
        out.println(header.toString());
        for (int n = 0; n < header.length(); n++) {
            out.print("-");
        }
        out.println();
        int numStates = getStateTableNumStates(table);
        for (int state = 0; state < numStates; state++) {
            dumpRow(out, table, state);
        }
        out.println();
    }
    ///CLOVER:ON

    ///CLOVER:OFF
    /**
     * Dump (for debug) a single row of an RBBI state table
     * @param out    stream to print the row to
     * @param table  the state table containing the row
     * @param state  the state (row) number
     */
    private void dumpRow(java.io.PrintStream out, short[] table, int state) {
        // A zero ACCEPTING / LOOKAHEAD value is shown as an empty cell; it must still
        // occupy the full 5-character column so that the following columns line up.
        final String blankColumn = "     ";

        StringBuilder dest = new StringBuilder(fHeader.fCatCount * 5 + 20);
        dest.append(intToString(state, 4));
        int row = getRowIndex(state);
        if (table[row + ACCEPTING] != 0) {
            dest.append(intToString(table[row + ACCEPTING], 5));
        } else {
            dest.append(blankColumn);
        }
        if (table[row + LOOKAHEAD] != 0) {
            dest.append(intToString(table[row + LOOKAHEAD], 5));
        } else {
            dest.append(blankColumn);
        }
        dest.append(intToString(table[row + TAGIDX], 5));

        for (int col = 0; col < fHeader.fCatCount; col++) {
            dest.append(intToString(table[row + NEXTSTATES + col], 5));
        }

        out.println(dest);
    }
    ///CLOVER:ON

    ///CLOVER:OFF
    /**
     * Dump (for debug) the code point ranges belonging to each character category.
     * Stops scanning at the first code point whose category is out of range.
     */
    private void dumpCharCategories(java.io.PrintStream out) {
        int n = fHeader.fCatCount;
        StringBuilder[] catStrings = new StringBuilder[n + 1];
        int[] lastNewline = new int[n + 1];
        for (int category = 0; category <= n; category++) {
            catStrings[category] = new StringBuilder();
        }

        int rangeStart = 0;
        int rangeEnd = 0;
        int lastCat = -1;     // -1 until the first valid category has been seen

        out.println("\nCharacter Categories");
        out.println("--------------------");
        for (int char32 = 0; char32 <= 0x10ffff; char32++) {
            int category = fTrie.getCodePointValue(char32);
            category &= ~0x4000;            // Mask off dictionary bit.
            if (category < 0 || category > n) {
                out.println("Error, bad category " + Integer.toHexString(category) +
                        " for char " + Integer.toHexString(char32));
                break;
            }
            if (category == lastCat) {
                rangeEnd = char32;
                continue;
            }
            // Category changed: flush the range that just ended, then start a new one.
            if (lastCat >= 0) {
                StringBuilder line = catStrings[lastCat];
                if (line.length() > lastNewline[lastCat] + 70) {
                    lastNewline[lastCat] = line.length() + 10;
                    line.append("\n ");
                }
                appendRange(line, rangeStart, rangeEnd);
            }
            lastCat = category;
            rangeStart = rangeEnd = char32;
        }
        // Flush the final pending range. lastCat is still -1 if the very first code
        // point had a bad category, in which case there is nothing to flush.
        if (lastCat >= 0) {
            appendRange(catStrings[lastCat], rangeStart, rangeEnd);
        }

        for (int category = 0; category <= n; category++) {
            out.println(intToString(category, 5) + " " + catStrings[category]);
        }
        out.println();
    }

    /** Appends " start" or " start-end" (hex) to dest. */
    private static void appendRange(StringBuilder dest, int rangeStart, int rangeEnd) {
        dest.append(" ").append(Integer.toHexString(rangeStart));
        if (rangeEnd != rangeStart) {
            dest.append("-").append(Integer.toHexString(rangeEnd));
        }
    }
    ///CLOVER:ON

    /*static RBBIDataWrapper get(String name) throws IOException {
        String  fullName = "data/" + name;
        InputStream is = ICUData.getRequiredStream(fullName);
        return get(is);
    }

    public static void main(String[] args) {
        String s;
        if (args.length == 0) {
            s = "char";
        } else {
            s = args[0];
        }
        System.out.println("RBBIDataWrapper.main(" + s + ") ");

        String versionedName = ICUResourceBundle.ICU_BUNDLE+"/"+ s + ".brk";

        try {
            RBBIDataWrapper This = RBBIDataWrapper.get(versionedName);
            This.dump();
        }
       catch (Exception e) {
           System.out.println("Exception: " + e.toString());
       }

    }*/
}