• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
4  **********************************************************************
5  * Author: Mark Davis
6  **********************************************************************
7  */
8 package org.unicode.cldr.util;
9 
10 import java.util.ArrayList;
11 import java.util.BitSet;
12 import java.util.Collection;
13 import java.util.Comparator;
14 import java.util.HashSet;
15 import java.util.Iterator;
16 import java.util.Map;
17 import java.util.Map.Entry;
18 import java.util.Set;
19 import java.util.TreeMap;
20 import java.util.TreeSet;
21 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness;
22 
23 public class StateDictionary<T> extends Dictionary<T> {
24 
25     private static final boolean DEBUG_FLATTEN = false;
26 
27     // results of build
28     private final ArrayList<Row> builtRows;
29 
30     private final Row builtBaseRow;
31 
32     private final IntMap<T> builtResults;
33 
34     private final int builtMaxByteLength;
35 
36     private final StringByteConverter byteString;
37 
38     // TODO remove before deployment; not thread safe
39     private static int debugReferenceCount = 0;
40     private int debugReferenceNumber = debugReferenceCount++;
41 
42     /**
43      * Only should be called by StateDictionaryBuilder
44      *
45      * @param builtBaseRow2
46      * @param builtRows2
47      * @param builtResults2
48      * @param builtMaxByteLength TODO
49      * @param byteConverter TODO
50      */
StateDictionary( Row builtBaseRow2, ArrayList<Row> builtRows2, IntMap<T> builtResults2, int builtMaxByteLength, StringByteConverter byteConverter)51     StateDictionary(
52             Row builtBaseRow2,
53             ArrayList<Row> builtRows2,
54             IntMap<T> builtResults2,
55             int builtMaxByteLength,
56             StringByteConverter byteConverter) {
57         builtBaseRow = builtBaseRow2;
58         builtRows = builtRows2;
59         builtResults = builtResults2;
60         this.builtMaxByteLength = builtMaxByteLength;
61         this.byteString = byteConverter;
62         // builtBaseValue = builtResults.get(0);
63     }
64 
65     @Override
getMatcher()66     public Matcher<T> getMatcher() {
67         return new StateMatcher();
68     }
69 
70     @Override
toString()71     public String toString() {
72         StringBuilder result = new StringBuilder();
73         // TreeSet<Row> rowSet = new TreeSet<Row>(builtRows);
74         for (Row row : builtRows) {
75             result.append(row.toString()).append(CldrUtility.LINE_SEPARATOR);
76         }
77         Map<T, Integer> map = builtResults.getValueMap();
78         Set<Pair<Integer, String>> sorted = new TreeSet<>();
79         for (T item : map.keySet()) {
80             sorted.add(new Pair<>(map.get(item), item.toString()));
81         }
82         for (Pair<Integer, String> pair : sorted) {
83             result.append(pair.getFirst())
84                     .append("*=")
85                     .append(pair.getSecond())
86                     .append(CldrUtility.LINE_SEPARATOR);
87         }
88         return result.toString();
89     }
90 
91     @Override
getMapping()92     public Iterator<Entry<CharSequence, T>> getMapping() {
93         // TODO Optimize this to only return the items on demand
94         return new TextFetcher().getWords().entrySet().iterator();
95     }
96 
97     @Override
debugShow()98     public String debugShow() {
99         return new TextFetcher().debugShow();
100     }
101 
getRowCount()102     public int getRowCount() {
103         return builtRows.size();
104     }
105 
106     /**
107      * Internals. The text is transformed into a byte stream. A state table is
108      * used to successively map {state, byte, result} to {newstate, newresult,
109      * isReturn}. A state is represented by a Row, which is a mapping from byte to
110      * a Cell, where each cell has the {nextRow, delta result, returns flag}.
111      *
112      * <pre>
113      *  state = next state (row)
114      *  result += delta result
115      *  if (returns) return the result
116      *  &lt;pre&gt;
117      * However, the result and state are preserved for the next call on next().
118      *
119      */
120     static class Row implements Comparable {
121         enum Uniqueness {
122             // the unknown value is only used in building
123             UNIQUE,
124             AMBIGUOUS,
125             UNKNOWN;
126 
debugName()127             public String debugName() {
128                 switch (this) {
129                     case UNIQUE:
130                         return ("¹");
131                     case AMBIGUOUS:
132                         return "²";
133                     default:
134                         return "?";
135                 }
136             }
137         }
138 
139         Uniqueness hasUniqueValue = Uniqueness.UNKNOWN;
140 
141         final TreeMap<Byte, Cell> byteToCell = new TreeMap<>();
142 
143         // keeps track of the number of cells with returns
144         transient int returnCount;
145 
146         transient int terminatingReturnCount;
147 
148         private int referenceNumber;
149 
Row(int rowNumber)150         Row(int rowNumber) {
151             referenceNumber = rowNumber;
152         }
153 
nonTerminating()154         public int nonTerminating() {
155             return byteToCell.size() - terminatingReturnCount;
156         }
157 
nonReturn()158         public int nonReturn() {
159             return byteToCell.size() - returnCount;
160         }
161 
maximumDepth()162         public int maximumDepth() {
163             int result = 0;
164             for (Cell cell : byteToCell.values()) {
165                 if (cell.nextRow != null) {
166                     int temp = cell.nextRow.maximumDepth() + 1;
167                     if (result < temp) {
168                         result = temp;
169                     }
170                 }
171             }
172             return result;
173         }
174 
175         @Override
compareTo(Object o)176         public int compareTo(Object o) {
177             Row other = (Row) o;
178             int result;
179             // we want to sort items first with the fewest number of non-terminating
180             // returns
181             // cells, then most
182             // number of terminating returns, then most number of returns
183             if (0 != (result = maximumDepth() - other.maximumDepth())) return result;
184             if (0 != (result = byteToCell.size() - other.byteToCell.size())) return result;
185             // otherwise, try alphabetic among the keys. We are guaranteed that the
186             // sizes are the same
187             java.util.Iterator<Byte> otherIt = other.byteToCell.keySet().iterator();
188             for (byte key : byteToCell.keySet()) {
189                 int otherKey = otherIt.next();
190                 if (0 != (result = key - otherKey)) {
191                     return result;
192                 }
193                 // at this point, we are guaranteed that the keys are the same. Compare
194                 // deltaResults, and row
195                 Cell cell = byteToCell.get(key);
196                 Cell otherCell = other.byteToCell.get(key);
197                 if (0 != (result = cell.deltaResult - otherCell.deltaResult)) {
198                     return result;
199                 }
200             }
201             // if we fail completely, use the age.
202             return referenceNumber - other.referenceNumber;
203         }
204 
205         @Override
toString()206         public String toString() {
207             StringBuilder buffer = new StringBuilder();
208             buffer.append("R" + getReferenceNumber() + hasUniqueValue.debugName() + "{");
209             boolean first = true;
210             Set<Byte> sorted = new TreeSet<>(unsignedByteComparator);
211             sorted.addAll(byteToCell.keySet());
212             for (Byte key : sorted) {
213                 if (first) {
214                     first = false;
215                 } else {
216                     buffer.append(' ');
217                 }
218                 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2));
219                 buffer.append('=');
220                 buffer.append(byteToCell.get(key));
221             }
222             buffer.append('}');
223             return buffer.toString();
224         }
225 
toStringCells()226         public String toStringCells() {
227             StringBuilder buffer = new StringBuilder();
228             for (Byte key : byteToCell.keySet()) {
229                 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2));
230                 buffer.append(byteToCell.get(key).toString());
231                 buffer.append(' ');
232             }
233             return buffer.toString();
234         }
235 
getReferenceNumber()236         public int getReferenceNumber() {
237             return referenceNumber;
238         }
239 
compact(byte[] target)240         int compact(byte[] target) {
241             int pos = 0;
242             for (Byte key : byteToCell.keySet()) {
243                 target[pos++] = key;
244                 pos = byteToCell.get(key).addBytes(target, pos, 0);
245             }
246             target[pos++] = 0;
247             return pos;
248         }
249     }
250 
251     static class Cell {
252         public Row nextRow; // next state
253 
254         public int deltaResult;
255 
256         public boolean returns;
257 
addBytes(byte[] target, int pos, int rowDelta)258         public int addBytes(byte[] target, int pos, int rowDelta) {
259             pos = StateDictionary.addBytes(deltaResult, target, pos);
260             int rowOffset = nextRow == null ? 0 : rowDelta - nextRow.getReferenceNumber();
261             rowOffset <<= 1; // make room for returns
262             if (returns) rowOffset |= 1;
263             return StateDictionary.addBytes(rowOffset, target, pos);
264         }
265 
266         @Override
toString()267         public String toString() {
268             String result = deltaResult == 0 ? "" : String.valueOf(deltaResult);
269             if (returns) {
270                 result += "*";
271             }
272             if (nextRow != null) {
273                 if (result.length() != 0) {
274                     result += "/";
275                 }
276                 result += "R" + nextRow.getReferenceNumber();
277             }
278             return result;
279         }
280     }
281 
282     // should be private, but easier to debug if package private
283     class StateMatcher extends Matcher<T> {
284         private static final boolean SHOW_DEBUG = false;
285 
286         private final byte[] matchByteBuffer = new byte[byteString.getMaxBytesPerChar()];
287 
288         private int matchByteStringIndex;
289 
290         private int matchByteBufferLength;
291 
292         // only used in matching
293         private Row matchCurrentRow;
294 
295         private int matchIntValue = -1;
296 
297         private Row matchLastRow;
298 
299         @Override
setOffset(int offset)300         public Matcher<T> setOffset(int offset) {
301             matchCurrentRow = builtBaseRow;
302             partialLastRow = null; // can remove this later, only for debugging
303             partialMatchValue = 0; // ditto
304             matchIntValue = 0;
305             myMatchEnd = offset;
306             matchValue = null;
307             byteString.clear();
308             matchByteStringIndex = offset;
309             return super.setOffset(offset);
310         }
311 
312         int myMatchEnd;
313 
314         private Row partialLastRow;
315 
316         private int partialMatchValue;
317 
318         @Override
next()319         public Status next() {
320             if (SHOW_DEBUG) {
321                 System.out.println("NEXT: " + this);
322             }
323             if (matchCurrentRow == null) {
324                 matchIntValue = -1;
325                 matchValue = null;
326                 return Status.NONE;
327             }
328             Status result = Status.PARTIAL;
329 
330             while (text.hasCharAt(myMatchEnd)) {
331                 // get more bytes IF matchEnd is set right
332                 if (myMatchEnd == matchByteStringIndex) {
333                     if (true) { // matchEnd < text.length()
334                         char ch = text.charAt(matchByteStringIndex++);
335                         matchByteBufferLength = byteString.toBytes(ch, matchByteBuffer, 0);
336                         if (SHOW_DEBUG) {
337                             System.out.println(
338                                     "\tChar: "
339                                             + ch
340                                             + "\t"
341                                             + com.ibm.icu.impl.Utility.hex(ch)
342                                             + "\t->\t"
343                                             + CldrUtility.hex(
344                                                     matchByteBuffer,
345                                                     0,
346                                                     matchByteBufferLength,
347                                                     " "));
348                         }
349                     } else {
350                         matchByteBufferLength = byteString.toBytes(matchByteBuffer, 0);
351                     }
352                 }
353                 for (int i = 0; i < matchByteBufferLength; ++i) {
354                     result = nextByte(matchByteBuffer[i]);
355                     if (result != Status.PARTIAL) {
356                         break;
357                     }
358                 }
359                 // Normally, we will never have a return value except at the end of a character, so
360                 // we don't need
361                 // to check after each nextByte. However, if the byteString converts C to a sequence
362                 // of bytes that
363                 // is a prefix of what it converts D into, then we will get a partial match *WITHIN*
364                 // a character
365 
366                 if (result == Status.PARTIAL) {
367                     ++myMatchEnd;
368                     // and continue with the loop
369                 } else if (result == Status.MATCH) {
370                     ++myMatchEnd;
371                     matchValue = builtResults.get(matchIntValue);
372                     matchEnd = myMatchEnd;
373                     if (SHOW_DEBUG) {
374                         System.out.println("NEXT RESULT: " + result + "\t" + this);
375                     }
376                     return result;
377                 } else {
378                     // if we didn't get a MATCH, we have NONE. But in reality, there could be a
379                     // possible match
380                     // so we check to see whether the current row allows for any continuation.
381                     if (myMatchEnd > offset && matchCurrentRow.byteToCell.size() > 0) {
382                         result = Status.PARTIAL;
383                     }
384                     if (result == Status.NONE) {
385                         matchIntValue = -1;
386                         matchValue = null;
387                     }
388                     break;
389                 }
390             }
391             matchLastRow = matchCurrentRow;
392             matchCurrentRow = null;
393             if (result == Status.PARTIAL) {
394                 matchValue = builtResults.get(matchIntValue);
395                 matchEnd = myMatchEnd;
396                 partialLastRow = matchLastRow;
397                 partialMatchValue = matchIntValue;
398                 if (SHOW_DEBUG) {
399                     System.out.println("NEXT RESULT: " + result + "\t" + this);
400                 }
401             }
402             return result;
403         }
404 
405         /**
406          * Returns NONE if we cannot go any farther, MATCH if there was a match, and PARTIAL
407          * otherwise. If we couldn't go any farther, then the currentRow is left alone.
408          *
409          * @param chunk
410          * @return
411          */
nextByte(int chunk)412         private Status nextByte(int chunk) {
413             if (SHOW_DEBUG) {
414                 System.out.println("\t" + debugReferenceNumber + "\tRow: " + matchCurrentRow);
415             }
416             Cell cell = matchCurrentRow.byteToCell.get((byte) chunk);
417             if (cell == null) {
418                 return Status.NONE;
419             }
420             matchIntValue += cell.deltaResult;
421             matchCurrentRow = cell.nextRow;
422             if (cell.returns) {
423                 return Status.MATCH;
424             }
425             return Status.PARTIAL;
426         }
427 
getIntMatchValue()428         public int getIntMatchValue() {
429             return matchIntValue;
430         }
431 
432         @Override
nextUniquePartial()433         public boolean nextUniquePartial() {
434             if (partialLastRow.hasUniqueValue == Uniqueness.UNIQUE) {
435                 matchValue = builtResults.get(partialMatchValue);
436                 matchEnd = myMatchEnd;
437                 return true;
438             }
439             return false;
440         }
441 
442         @Override
getDictionary()443         public StateDictionary<T> getDictionary() {
444             return StateDictionary.this;
445         }
446     }
447 
448     static final Comparator<Byte> unsignedByteComparator =
449             new Comparator<Byte>() {
450 
451                 @Override
452                 public int compare(Byte o1, Byte o2) {
453                     int b1 = o1 & 0xFF;
454                     int b2 = o2 & 0xFF;
455                     return b1 < b2 ? -1 : b1 > b2 ? 1 : 0;
456                 }
457             };
458 
459     static final Comparator<Row> rowComparator =
460             new Comparator<Row>() {
461 
462                 @Override
463                 public int compare(Row row1, Row row2) {
464                     if (row1 == row2) {
465                         return 0;
466                     } else if (row1 == null) {
467                         return -1;
468                     } else if (row2 == null) {
469                         return 1;
470                     }
471                     int result;
472                     if (0 != (result = row1.byteToCell.size() - row2.byteToCell.size())) {
473                         return result;
474                     }
475                     java.util.Iterator<Byte> otherIt = row2.byteToCell.keySet().iterator();
476                     for (byte key : row1.byteToCell.keySet()) {
477                         byte otherKey = otherIt.next();
478                         if (0 != (result = key - otherKey)) {
479                             return result;
480                         }
481                         // at this point, we are guaranteed that the keys are the same. Compare
482                         // deltaResults, returns, and then recurse on the the row
483                         Cell cell1 = row1.byteToCell.get(key);
484                         Cell cell2 = row2.byteToCell.get(key);
485                         if (0 != (result = cell1.deltaResult - cell2.deltaResult)) {
486                             return result;
487                         }
488                         if (cell1.returns != cell2.returns) {
489                             return cell1.returns ? 1 : -1;
490                         }
491                         if (0 != (result = compare(cell1.nextRow, cell2.nextRow))) {
492                             return result;
493                         }
494                     }
495                     return 0;
496                 }
497             };
498 
addBytes(int source, byte[] target, int pos)499     static int addBytes(int source, byte[] target, int pos) {
500         // swap the top bit
501         if (source < 0) {
502             source = ((-source) << 1) | 1;
503         } else {
504             source <<= 1;
505         }
506         // emit the rest as 7 bit quantities with 1 as termination bit
507         while (true) {
508             byte b = (byte) (source & 0x7F);
509             source >>>= 7;
510             if (source == 0) {
511                 b |= 0x80;
512                 target[pos++] = b;
513                 return pos;
514             }
515             target[pos++] = b;
516         }
517     }
518 
519     private class TextFetcher {
520 
521         Map<CharSequence, T> result = new TreeMap<>();
522 
523         byte[] soFar = new byte[builtMaxByteLength];
524 
525         BitSet shown = new BitSet();
526 
527         StringBuilder buffer = new StringBuilder();
528 
529         StringBuilder debugTreeView = new StringBuilder();
530 
531         private HashSet<Row> rowsSeen = new HashSet<>();
532 
getWords()533         public Map<CharSequence, T> getWords() {
534             result.clear();
535             getWords(0, 0, builtBaseRow);
536             return result;
537         }
538 
debugShow()539         public String debugShow() {
540             rowsSeen.clear();
541             Counter<Integer> debugCounter = new Counter<>();
542             getDebugWords(0, 0, builtBaseRow, Integer.MAX_VALUE);
543             for (Row row : builtRows) {
544                 debugCounter.add(row.byteToCell.size(), 1);
545             }
546             for (Integer item : (Collection<Integer>) debugCounter.getKeysetSortedByKey()) {
547                 debugTreeView
548                         .append("cells in row=\t")
549                         .append(item)
550                         .append("\trows with count=\t")
551                         .append(debugCounter.getCount(item))
552                         .append(CldrUtility.LINE_SEPARATOR);
553             }
554             return debugTreeView.toString();
555         }
556 
getDebugWords(int byteLength, int resultSoFar, Row row, int suppressAbove)557         private void getDebugWords(int byteLength, int resultSoFar, Row row, int suppressAbove) {
558             // we do this to show when rows have already been seen, and so the structure has been
559             // compacted
560             if (rowsSeen.contains(row)) {
561                 // reset if bigger
562                 if (suppressAbove > byteLength) {
563                     suppressAbove = byteLength;
564                 }
565             } else {
566                 rowsSeen.add(row);
567             }
568             // walk through the cells, display and recurse
569             Set<Byte> sorted = new TreeSet<>(unsignedByteComparator);
570             sorted.addAll(row.byteToCell.keySet());
571             for (Byte key : sorted) {
572                 Cell cell = row.byteToCell.get(key);
573                 soFar[byteLength] = key;
574                 shown.set(byteLength, false);
575                 int currentValue = resultSoFar + cell.deltaResult;
576                 if (cell.returns) {
577                     CharSequence key2 = stringFromBytes(soFar, byteLength + 1);
578                     T value2 = builtResults.get(currentValue);
579                     for (int i = 0; i <= byteLength; ++i) {
580                         debugTreeView.append(' ');
581                         if (i >= suppressAbove) {
582                             debugTreeView.append("++");
583                         } else if (shown.get(i)) {
584                             debugTreeView.append("--");
585                         } else {
586                             debugTreeView.append(com.ibm.icu.impl.Utility.hex(soFar[i] & 0xFF, 2));
587                             shown.set(i);
588                         }
589                     }
590                     debugTreeView
591                             .append("\t<")
592                             .append(key2)
593                             .append(">\t<")
594                             .append(value2)
595                             .append(">" + CldrUtility.LINE_SEPARATOR);
596                 }
597                 if (cell.nextRow != null) {
598                     getDebugWords(byteLength + 1, currentValue, cell.nextRow, suppressAbove);
599                 }
600             }
601         }
602 
603         // recurse through the strings
getWords(int byteLength, int resultSoFar, Row row)604         private void getWords(int byteLength, int resultSoFar, Row row) {
605             for (Byte key : row.byteToCell.keySet()) {
606                 Cell cell = row.byteToCell.get(key);
607                 soFar[byteLength] = key;
608                 int currentValue = resultSoFar + cell.deltaResult;
609                 if (cell.returns) {
610                     CharSequence key2 = stringFromBytes(soFar, byteLength + 1);
611                     T value2 = builtResults.get(currentValue);
612                     result.put(key2, value2);
613                 }
614                 if (cell.nextRow != null) {
615                     getWords(byteLength + 1, currentValue, cell.nextRow);
616                 }
617             }
618         }
619 
stringFromBytes(byte[] soFar, int len)620         private CharSequence stringFromBytes(byte[] soFar, int len) {
621             buffer.setLength(0);
622             byteString.fromBytes(soFar, 0, len, buffer);
623             return buffer.toString();
624         }
625     }
626 
627     /** Just for testing flattening. */
flatten()628     public void flatten() {
629         TreeSet<Row> s = new TreeSet<>(builtRows);
630         int count = 0;
631         int oldDepth = 999;
632         String oldCell = "";
633         int uniqueCount = 0;
634         int cellCount = 0;
635         byte[] target = new byte[500];
636         int totalBytesCompacted = 0;
637         for (Row row : s) {
638             row.referenceNumber = count++;
639             int depth = row.maximumDepth();
640             if (depth != oldDepth) {
641                 if (DEBUG_FLATTEN) {
642                     System.out.println("*** " + depth + "***");
643                 }
644                 oldDepth = depth;
645             }
646             int bytesCompacted = row.compact(target);
647             if (DEBUG_FLATTEN) {
648                 System.out.println(bytesCompacted + "\t" + row);
649             }
650             String newCell = row.toStringCells();
651             if (!newCell.equals(oldCell)) {
652                 uniqueCount++;
653                 totalBytesCompacted += bytesCompacted;
654                 cellCount += row.byteToCell.size();
655             }
656             oldCell = newCell;
657 
658             for (Cell cell : row.byteToCell.values()) {
659                 if (cell.nextRow != null && cell.nextRow.referenceNumber > row.referenceNumber) {
660                     if (DEBUG_FLATTEN) {
661                         System.out.println("*** Fail");
662                     }
663                     break;
664                 }
665             }
666         }
667         System.out.println("Count: " + count);
668         System.out.println("UniqueCount: " + uniqueCount);
669         System.out.println("CellCount: " + cellCount);
670         System.out.println("TotalBytesCompacted: " + totalBytesCompacted);
671     }
672 }
673