1 /* 2 ********************************************************************** 3 * Copyright (c) 2006-2007, Google and others. All Rights Reserved. 4 ********************************************************************** 5 * Author: Mark Davis 6 ********************************************************************** 7 */ 8 package org.unicode.cldr.util; 9 10 import java.util.ArrayList; 11 import java.util.BitSet; 12 import java.util.Collection; 13 import java.util.Comparator; 14 import java.util.HashSet; 15 import java.util.Iterator; 16 import java.util.Map; 17 import java.util.Map.Entry; 18 import java.util.Set; 19 import java.util.TreeMap; 20 import java.util.TreeSet; 21 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness; 22 23 public class StateDictionary<T> extends Dictionary<T> { 24 25 private static final boolean DEBUG_FLATTEN = false; 26 27 // results of build 28 private final ArrayList<Row> builtRows; 29 30 private final Row builtBaseRow; 31 32 private final IntMap<T> builtResults; 33 34 private final int builtMaxByteLength; 35 36 private final StringByteConverter byteString; 37 38 // TODO remove before deployment; not thread safe 39 private static int debugReferenceCount = 0; 40 private int debugReferenceNumber = debugReferenceCount++; 41 42 /** 43 * Only should be called by StateDictionaryBuilder 44 * 45 * @param builtBaseRow2 46 * @param builtRows2 47 * @param builtResults2 48 * @param builtMaxByteLength TODO 49 * @param byteConverter TODO 50 */ StateDictionary( Row builtBaseRow2, ArrayList<Row> builtRows2, IntMap<T> builtResults2, int builtMaxByteLength, StringByteConverter byteConverter)51 StateDictionary( 52 Row builtBaseRow2, 53 ArrayList<Row> builtRows2, 54 IntMap<T> builtResults2, 55 int builtMaxByteLength, 56 StringByteConverter byteConverter) { 57 builtBaseRow = builtBaseRow2; 58 builtRows = builtRows2; 59 builtResults = builtResults2; 60 this.builtMaxByteLength = builtMaxByteLength; 61 this.byteString = byteConverter; 62 // builtBaseValue = builtResults.get(0); 63 } 64 65 @Override getMatcher()66 public Matcher<T> getMatcher() { 67 return new StateMatcher(); 68 } 69 70 @Override toString()71 public String toString() { 72 StringBuilder result = new StringBuilder(); 73 // TreeSet<Row> rowSet = new TreeSet<Row>(builtRows); 74 for (Row row : builtRows) { 75 result.append(row.toString()).append(CldrUtility.LINE_SEPARATOR); 76 } 77 Map<T, Integer> map = builtResults.getValueMap(); 78 Set<Pair<Integer, String>> sorted = new TreeSet<>(); 79 for (T item : map.keySet()) { 80 sorted.add(new Pair<>(map.get(item), item.toString())); 81 } 82 for (Pair<Integer, String> pair : sorted) { 83 result.append(pair.getFirst()) 84 .append("*=") 85 .append(pair.getSecond()) 86 .append(CldrUtility.LINE_SEPARATOR); 87 } 88 return result.toString(); 89 } 90 91 @Override getMapping()92 public Iterator<Entry<CharSequence, T>> getMapping() { 93 // TODO Optimize this to only return the items on demand 94 return new TextFetcher().getWords().entrySet().iterator(); 95 } 96 97 @Override debugShow()98 public String debugShow() { 99 return new TextFetcher().debugShow(); 100 } 101 getRowCount()102 public int getRowCount() { 103 return builtRows.size(); 104 } 105 106 /** 107 * Internals. The text is transformed into a byte stream. A state table is 108 * used to successively map {state, byte, result} to {newstate, newresult, 109 * isReturn}. A state is represented by a Row, which is a mapping from byte to 110 * a Cell, where each cell has the {nextRow, delta result, returns flag}. 111 * 112 * <pre> 113 * state = next state (row) 114 * result += delta result 115 * if (returns) return the result 116 * <pre> 117 * However, the result and state are preserved for the next call on next(). 118 * 119 */ 120 static class Row implements Comparable { 121 enum Uniqueness { 122 // the unknown value is only used in building 123 UNIQUE, 124 AMBIGUOUS, 125 UNKNOWN; 126 debugName()127 public String debugName() { 128 switch (this) { 129 case UNIQUE: 130 return ("¹"); 131 case AMBIGUOUS: 132 return "²"; 133 default: 134 return "?"; 135 } 136 } 137 } 138 139 Uniqueness hasUniqueValue = Uniqueness.UNKNOWN; 140 141 final TreeMap<Byte, Cell> byteToCell = new TreeMap<>(); 142 143 // keeps track of the number of cells with returns 144 transient int returnCount; 145 146 transient int terminatingReturnCount; 147 148 private int referenceNumber; 149 Row(int rowNumber)150 Row(int rowNumber) { 151 referenceNumber = rowNumber; 152 } 153 nonTerminating()154 public int nonTerminating() { 155 return byteToCell.size() - terminatingReturnCount; 156 } 157 nonReturn()158 public int nonReturn() { 159 return byteToCell.size() - returnCount; 160 } 161 maximumDepth()162 public int maximumDepth() { 163 int result = 0; 164 for (Cell cell : byteToCell.values()) { 165 if (cell.nextRow != null) { 166 int temp = cell.nextRow.maximumDepth() + 1; 167 if (result < temp) { 168 result = temp; 169 } 170 } 171 } 172 return result; 173 } 174 175 @Override compareTo(Object o)176 public int compareTo(Object o) { 177 Row other = (Row) o; 178 int result; 179 // we want to sort items first with the fewest number of non-terminating 180 // returns 181 // cells, then most 182 // number of terminating returns, then most number of returns 183 if (0 != (result = maximumDepth() - other.maximumDepth())) return result; 184 if (0 != (result = byteToCell.size() - other.byteToCell.size())) return result; 185 // otherwise, try alphabetic among the keys. We are guaranteed that the 186 // sizes are the same 187 java.util.Iterator<Byte> otherIt = other.byteToCell.keySet().iterator(); 188 for (byte key : byteToCell.keySet()) { 189 int otherKey = otherIt.next(); 190 if (0 != (result = key - otherKey)) { 191 return result; 192 } 193 // at this point, we are guaranteed that the keys are the same. Compare 194 // deltaResults, and row 195 Cell cell = byteToCell.get(key); 196 Cell otherCell = other.byteToCell.get(key); 197 if (0 != (result = cell.deltaResult - otherCell.deltaResult)) { 198 return result; 199 } 200 } 201 // if we fail completely, use the age. 202 return referenceNumber - other.referenceNumber; 203 } 204 205 @Override toString()206 public String toString() { 207 StringBuilder buffer = new StringBuilder(); 208 buffer.append("R" + getReferenceNumber() + hasUniqueValue.debugName() + "{"); 209 boolean first = true; 210 Set<Byte> sorted = new TreeSet<>(unsignedByteComparator); 211 sorted.addAll(byteToCell.keySet()); 212 for (Byte key : sorted) { 213 if (first) { 214 first = false; 215 } else { 216 buffer.append(' '); 217 } 218 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2)); 219 buffer.append('='); 220 buffer.append(byteToCell.get(key)); 221 } 222 buffer.append('}'); 223 return buffer.toString(); 224 } 225 toStringCells()226 public String toStringCells() { 227 StringBuilder buffer = new StringBuilder(); 228 for (Byte key : byteToCell.keySet()) { 229 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2)); 230 buffer.append(byteToCell.get(key).toString()); 231 buffer.append(' '); 232 } 233 return buffer.toString(); 234 } 235 getReferenceNumber()236 public int getReferenceNumber() { 237 return referenceNumber; 238 } 239 compact(byte[] target)240 int compact(byte[] target) { 241 int pos = 0; 242 for (Byte key : byteToCell.keySet()) { 243 target[pos++] = key; 244 pos = byteToCell.get(key).addBytes(target, pos, 0); 245 } 246 target[pos++] = 0; 247 return pos; 248 } 249 } 250 251 static class Cell { 252 public Row nextRow; // next state 253 254 public int deltaResult; 255 256 public boolean returns; 257 addBytes(byte[] target, int pos, int rowDelta)258 public int addBytes(byte[] target, int pos, int rowDelta) { 259 pos = StateDictionary.addBytes(deltaResult, target, pos); 260 int rowOffset = nextRow == null ? 0 : rowDelta - nextRow.getReferenceNumber(); 261 rowOffset <<= 1; // make room for returns 262 if (returns) rowOffset |= 1; 263 return StateDictionary.addBytes(rowOffset, target, pos); 264 } 265 266 @Override toString()267 public String toString() { 268 String result = deltaResult == 0 ? "" : String.valueOf(deltaResult); 269 if (returns) { 270 result += "*"; 271 } 272 if (nextRow != null) { 273 if (result.length() != 0) { 274 result += "/"; 275 } 276 result += "R" + nextRow.getReferenceNumber(); 277 } 278 return result; 279 } 280 } 281 282 // should be private, but easier to debug if package private 283 class StateMatcher extends Matcher<T> { 284 private static final boolean SHOW_DEBUG = false; 285 286 private final byte[] matchByteBuffer = new byte[byteString.getMaxBytesPerChar()]; 287 288 private int matchByteStringIndex; 289 290 private int matchByteBufferLength; 291 292 // only used in matching 293 private Row matchCurrentRow; 294 295 private int matchIntValue = -1; 296 297 private Row matchLastRow; 298 299 @Override setOffset(int offset)300 public Matcher<T> setOffset(int offset) { 301 matchCurrentRow = builtBaseRow; 302 partialLastRow = null; // can remove this later, only for debugging 303 partialMatchValue = 0; // ditto 304 matchIntValue = 0; 305 myMatchEnd = offset; 306 matchValue = null; 307 byteString.clear(); 308 matchByteStringIndex = offset; 309 return super.setOffset(offset); 310 } 311 312 int myMatchEnd; 313 314 private Row partialLastRow; 315 316 private int partialMatchValue; 317 318 @Override next()319 public Status next() { 320 if (SHOW_DEBUG) { 321 System.out.println("NEXT: " + this); 322 } 323 if (matchCurrentRow == null) { 324 matchIntValue = -1; 325 matchValue = null; 326 return Status.NONE; 327 } 328 Status result = Status.PARTIAL; 329 330 while (text.hasCharAt(myMatchEnd)) { 331 // get more bytes IF matchEnd is set right 332 if (myMatchEnd == matchByteStringIndex) { 333 if (true) { // matchEnd < text.length() 334 char ch = text.charAt(matchByteStringIndex++); 335 matchByteBufferLength = byteString.toBytes(ch, matchByteBuffer, 0); 336 if (SHOW_DEBUG) { 337 System.out.println( 338 "\tChar: " 339 + ch 340 + "\t" 341 + com.ibm.icu.impl.Utility.hex(ch) 342 + "\t->\t" 343 + CldrUtility.hex( 344 matchByteBuffer, 345 0, 346 matchByteBufferLength, 347 " ")); 348 } 349 } else { 350 matchByteBufferLength = byteString.toBytes(matchByteBuffer, 0); 351 } 352 } 353 for (int i = 0; i < matchByteBufferLength; ++i) { 354 result = nextByte(matchByteBuffer[i]); 355 if (result != Status.PARTIAL) { 356 break; 357 } 358 } 359 // Normally, we will never have a return value except at the end of a character, so 360 // we don't need 361 // to check after each nextByte. However, if the byteString converts C to a sequence 362 // of bytes that 363 // is a prefix of what it converts D into, then we will get a partial match *WITHIN* 364 // a character 365 366 if (result == Status.PARTIAL) { 367 ++myMatchEnd; 368 // and continue with the loop 369 } else if (result == Status.MATCH) { 370 ++myMatchEnd; 371 matchValue = builtResults.get(matchIntValue); 372 matchEnd = myMatchEnd; 373 if (SHOW_DEBUG) { 374 System.out.println("NEXT RESULT: " + result + "\t" + this); 375 } 376 return result; 377 } else { 378 // if we didn't get a MATCH, we have NONE. But in reality, there could be a 379 // possible match 380 // so we check to see whether the current row allows for any continuation. 381 if (myMatchEnd > offset && matchCurrentRow.byteToCell.size() > 0) { 382 result = Status.PARTIAL; 383 } 384 if (result == Status.NONE) { 385 matchIntValue = -1; 386 matchValue = null; 387 } 388 break; 389 } 390 } 391 matchLastRow = matchCurrentRow; 392 matchCurrentRow = null; 393 if (result == Status.PARTIAL) { 394 matchValue = builtResults.get(matchIntValue); 395 matchEnd = myMatchEnd; 396 partialLastRow = matchLastRow; 397 partialMatchValue = matchIntValue; 398 if (SHOW_DEBUG) { 399 System.out.println("NEXT RESULT: " + result + "\t" + this); 400 } 401 } 402 return result; 403 } 404 405 /** 406 * Returns NONE if we cannot go any farther, MATCH if there was a match, and PARTIAL 407 * otherwise. If we couldn't go any farther, then the currentRow is left alone. 408 * 409 * @param chunk 410 * @return 411 */ nextByte(int chunk)412 private Status nextByte(int chunk) { 413 if (SHOW_DEBUG) { 414 System.out.println("\t" + debugReferenceNumber + "\tRow: " + matchCurrentRow); 415 } 416 Cell cell = matchCurrentRow.byteToCell.get((byte) chunk); 417 if (cell == null) { 418 return Status.NONE; 419 } 420 matchIntValue += cell.deltaResult; 421 matchCurrentRow = cell.nextRow; 422 if (cell.returns) { 423 return Status.MATCH; 424 } 425 return Status.PARTIAL; 426 } 427 getIntMatchValue()428 public int getIntMatchValue() { 429 return matchIntValue; 430 } 431 432 @Override nextUniquePartial()433 public boolean nextUniquePartial() { 434 if (partialLastRow.hasUniqueValue == Uniqueness.UNIQUE) { 435 matchValue = builtResults.get(partialMatchValue); 436 matchEnd = myMatchEnd; 437 return true; 438 } 439 return false; 440 } 441 442 @Override getDictionary()443 public StateDictionary<T> getDictionary() { 444 return StateDictionary.this; 445 } 446 } 447 448 static final Comparator<Byte> unsignedByteComparator = 449 new Comparator<Byte>() { 450 451 @Override 452 public int compare(Byte o1, Byte o2) { 453 int b1 = o1 & 0xFF; 454 int b2 = o2 & 0xFF; 455 return b1 < b2 ? -1 : b1 > b2 ? 1 : 0; 456 } 457 }; 458 459 static final Comparator<Row> rowComparator = 460 new Comparator<Row>() { 461 462 @Override 463 public int compare(Row row1, Row row2) { 464 if (row1 == row2) { 465 return 0; 466 } else if (row1 == null) { 467 return -1; 468 } else if (row2 == null) { 469 return 1; 470 } 471 int result; 472 if (0 != (result = row1.byteToCell.size() - row2.byteToCell.size())) { 473 return result; 474 } 475 java.util.Iterator<Byte> otherIt = row2.byteToCell.keySet().iterator(); 476 for (byte key : row1.byteToCell.keySet()) { 477 byte otherKey = otherIt.next(); 478 if (0 != (result = key - otherKey)) { 479 return result; 480 } 481 // at this point, we are guaranteed that the keys are the same. Compare 482 // deltaResults, returns, and then recurse on the the row 483 Cell cell1 = row1.byteToCell.get(key); 484 Cell cell2 = row2.byteToCell.get(key); 485 if (0 != (result = cell1.deltaResult - cell2.deltaResult)) { 486 return result; 487 } 488 if (cell1.returns != cell2.returns) { 489 return cell1.returns ? 1 : -1; 490 } 491 if (0 != (result = compare(cell1.nextRow, cell2.nextRow))) { 492 return result; 493 } 494 } 495 return 0; 496 } 497 }; 498 addBytes(int source, byte[] target, int pos)499 static int addBytes(int source, byte[] target, int pos) { 500 // swap the top bit 501 if (source < 0) { 502 source = ((-source) << 1) | 1; 503 } else { 504 source <<= 1; 505 } 506 // emit the rest as 7 bit quantities with 1 as termination bit 507 while (true) { 508 byte b = (byte) (source & 0x7F); 509 source >>>= 7; 510 if (source == 0) { 511 b |= 0x80; 512 target[pos++] = b; 513 return pos; 514 } 515 target[pos++] = b; 516 } 517 } 518 519 private class TextFetcher { 520 521 Map<CharSequence, T> result = new TreeMap<>(); 522 523 byte[] soFar = new byte[builtMaxByteLength]; 524 525 BitSet shown = new BitSet(); 526 527 StringBuilder buffer = new StringBuilder(); 528 529 StringBuilder debugTreeView = new StringBuilder(); 530 531 private HashSet<Row> rowsSeen = new HashSet<>(); 532 getWords()533 public Map<CharSequence, T> getWords() { 534 result.clear(); 535 getWords(0, 0, builtBaseRow); 536 return result; 537 } 538 debugShow()539 public String debugShow() { 540 rowsSeen.clear(); 541 Counter<Integer> debugCounter = new Counter<>(); 542 getDebugWords(0, 0, builtBaseRow, Integer.MAX_VALUE); 543 for (Row row : builtRows) { 544 debugCounter.add(row.byteToCell.size(), 1); 545 } 546 for (Integer item : (Collection<Integer>) debugCounter.getKeysetSortedByKey()) { 547 debugTreeView 548 .append("cells in row=\t") 549 .append(item) 550 .append("\trows with count=\t") 551 .append(debugCounter.getCount(item)) 552 .append(CldrUtility.LINE_SEPARATOR); 553 } 554 return debugTreeView.toString(); 555 } 556 getDebugWords(int byteLength, int resultSoFar, Row row, int suppressAbove)557 private void getDebugWords(int byteLength, int resultSoFar, Row row, int suppressAbove) { 558 // we do this to show when rows have already been seen, and so the structure has been 559 // compacted 560 if (rowsSeen.contains(row)) { 561 // reset if bigger 562 if (suppressAbove > byteLength) { 563 suppressAbove = byteLength; 564 } 565 } else { 566 rowsSeen.add(row); 567 } 568 // walk through the cells, display and recurse 569 Set<Byte> sorted = new TreeSet<>(unsignedByteComparator); 570 sorted.addAll(row.byteToCell.keySet()); 571 for (Byte key : sorted) { 572 Cell cell = row.byteToCell.get(key); 573 soFar[byteLength] = key; 574 shown.set(byteLength, false); 575 int currentValue = resultSoFar + cell.deltaResult; 576 if (cell.returns) { 577 CharSequence key2 = stringFromBytes(soFar, byteLength + 1); 578 T value2 = builtResults.get(currentValue); 579 for (int i = 0; i <= byteLength; ++i) { 580 debugTreeView.append(' '); 581 if (i >= suppressAbove) { 582 debugTreeView.append("++"); 583 } else if (shown.get(i)) { 584 debugTreeView.append("--"); 585 } else { 586 debugTreeView.append(com.ibm.icu.impl.Utility.hex(soFar[i] & 0xFF, 2)); 587 shown.set(i); 588 } 589 } 590 debugTreeView 591 .append("\t<") 592 .append(key2) 593 .append(">\t<") 594 .append(value2) 595 .append(">" + CldrUtility.LINE_SEPARATOR); 596 } 597 if (cell.nextRow != null) { 598 getDebugWords(byteLength + 1, currentValue, cell.nextRow, suppressAbove); 599 } 600 } 601 } 602 603 // recurse through the strings getWords(int byteLength, int resultSoFar, Row row)604 private void getWords(int byteLength, int resultSoFar, Row row) { 605 for (Byte key : row.byteToCell.keySet()) { 606 Cell cell = row.byteToCell.get(key); 607 soFar[byteLength] = key; 608 int currentValue = resultSoFar + cell.deltaResult; 609 if (cell.returns) { 610 CharSequence key2 = stringFromBytes(soFar, byteLength + 1); 611 T value2 = builtResults.get(currentValue); 612 result.put(key2, value2); 613 } 614 if (cell.nextRow != null) { 615 getWords(byteLength + 1, currentValue, cell.nextRow); 616 } 617 } 618 } 619 stringFromBytes(byte[] soFar, int len)620 private CharSequence stringFromBytes(byte[] soFar, int len) { 621 buffer.setLength(0); 622 byteString.fromBytes(soFar, 0, len, buffer); 623 return buffer.toString(); 624 } 625 } 626 627 /** Just for testing flattening. */ flatten()628 public void flatten() { 629 TreeSet<Row> s = new TreeSet<>(builtRows); 630 int count = 0; 631 int oldDepth = 999; 632 String oldCell = ""; 633 int uniqueCount = 0; 634 int cellCount = 0; 635 byte[] target = new byte[500]; 636 int totalBytesCompacted = 0; 637 for (Row row : s) { 638 row.referenceNumber = count++; 639 int depth = row.maximumDepth(); 640 if (depth != oldDepth) { 641 if (DEBUG_FLATTEN) { 642 System.out.println("*** " + depth + "***"); 643 } 644 oldDepth = depth; 645 } 646 int bytesCompacted = row.compact(target); 647 if (DEBUG_FLATTEN) { 648 System.out.println(bytesCompacted + "\t" + row); 649 } 650 String newCell = row.toStringCells(); 651 if (!newCell.equals(oldCell)) { 652 uniqueCount++; 653 totalBytesCompacted += bytesCompacted; 654 cellCount += row.byteToCell.size(); 655 } 656 oldCell = newCell; 657 658 for (Cell cell : row.byteToCell.values()) { 659 if (cell.nextRow != null && cell.nextRow.referenceNumber > row.referenceNumber) { 660 if (DEBUG_FLATTEN) { 661 System.out.println("*** Fail"); 662 } 663 break; 664 } 665 } 666 } 667 System.out.println("Count: " + count); 668 System.out.println("UniqueCount: " + uniqueCount); 669 System.out.println("CellCount: " + cellCount); 670 System.out.println("TotalBytesCompacted: " + totalBytesCompacted); 671 } 672 } 673