• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
4  **********************************************************************
5  * Author: Mark Davis
6  **********************************************************************
7  */
8 package org.unicode.cldr.util;
9 
10 import java.util.ArrayList;
11 import java.util.BitSet;
12 import java.util.Collection;
13 import java.util.Comparator;
14 import java.util.HashSet;
15 import java.util.Iterator;
16 import java.util.Map;
17 import java.util.Map.Entry;
18 import java.util.Set;
19 import java.util.TreeMap;
20 import java.util.TreeSet;
21 
22 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness;
23 
24 public class StateDictionary<T> extends Dictionary<T> {
25 
26     private static final boolean DEBUG_FLATTEN = false;
27 
28     // results of build
29     private final ArrayList<Row> builtRows;
30 
31     private final Row builtBaseRow;
32 
33     private final IntMap<T> builtResults;
34 
35     private final int builtMaxByteLength;
36 
37     private final StringByteConverter byteString;
38 
39     // TODO remove before deployment; not thread safe
40     private static int debugReferenceCount = 0;
41     private int debugReferenceNumber = debugReferenceCount++;
42 
43     /**
44      * Only should be called by StateDictionaryBuilder
45      *
46      * @param builtBaseRow2
47      * @param builtRows2
48      * @param builtResults2
49      * @param builtMaxByteLength
50      *            TODO
51      * @param byteConverter
52      *            TODO
53      */
StateDictionary(Row builtBaseRow2, ArrayList<Row> builtRows2, IntMap<T> builtResults2, int builtMaxByteLength, StringByteConverter byteConverter)54     StateDictionary(Row builtBaseRow2, ArrayList<Row> builtRows2,
55         IntMap<T> builtResults2, int builtMaxByteLength,
56         StringByteConverter byteConverter) {
57         builtBaseRow = builtBaseRow2;
58         builtRows = builtRows2;
59         builtResults = builtResults2;
60         this.builtMaxByteLength = builtMaxByteLength;
61         this.byteString = byteConverter;
62         // builtBaseValue = builtResults.get(0);
63     }
64 
65     @Override
getMatcher()66     public Matcher<T> getMatcher() {
67         return new StateMatcher();
68     }
69 
toString()70     public String toString() {
71         StringBuilder result = new StringBuilder();
72         // TreeSet<Row> rowSet = new TreeSet<Row>(builtRows);
73         for (Row row : builtRows) {
74             result.append(row.toString()).append(CldrUtility.LINE_SEPARATOR);
75         }
76         Map<T, Integer> map = builtResults.getValueMap();
77         Set<Pair<Integer, String>> sorted = new TreeSet<Pair<Integer, String>>();
78         for (T item : map.keySet()) {
79             sorted.add(new Pair<Integer, String>(map.get(item), item.toString()));
80         }
81         for (Pair<Integer, String> pair : sorted) {
82             result.append(pair.getFirst()).append("*=").append(pair.getSecond()).append(CldrUtility.LINE_SEPARATOR);
83         }
84         return result.toString();
85     }
86 
getMapping()87     public Iterator<Entry<CharSequence, T>> getMapping() {
88         // TODO Optimize this to only return the items on demand
89         return new TextFetcher().getWords().entrySet().iterator();
90     }
91 
92     @Override
debugShow()93     public String debugShow() {
94         return new TextFetcher().debugShow();
95     }
96 
getRowCount()97     public int getRowCount() {
98         return builtRows.size();
99     }
100 
101     /**
102      * Internals. The text is transformed into a byte stream. A state table is
103      * used to successively map {state, byte, result} to {newstate, newresult,
104      * isReturn}. A state is represented by a Row, which is a mapping from byte to
105      * a Cell, where each cell has the {nextRow, delta result, returns flag}.
106      *
107      * <pre>
108      *  state = next state (row)
109      *  result += delta result
110      *  if (returns) return the result
111      *  &lt;pre&gt;
112      * However, the result and state are preserved for the next call on next().
113      *
114      */
115 
116     static class Row implements Comparable {
117         enum Uniqueness {
118             // the unknown value is only used in building
119             UNIQUE, AMBIGUOUS, UNKNOWN;
120 
debugName()121             public String debugName() {
122                 switch (this) {
123                 case UNIQUE:
124                     return ("¹");
125                 case AMBIGUOUS:
126                     return "²";
127                 default:
128                     return "?";
129                 }
130             }
131         }
132 
133         Uniqueness hasUniqueValue = Uniqueness.UNKNOWN;
134 
135         final TreeMap<Byte, Cell> byteToCell = new TreeMap<Byte, Cell>();
136 
137         // keeps track of the number of cells with returns
138         transient int returnCount;
139 
140         transient int terminatingReturnCount;
141 
142         private int referenceNumber;
143 
Row(int rowNumber)144         Row(int rowNumber) {
145             referenceNumber = rowNumber;
146         }
147 
nonTerminating()148         public int nonTerminating() {
149             return byteToCell.size() - terminatingReturnCount;
150         }
151 
nonReturn()152         public int nonReturn() {
153             return byteToCell.size() - returnCount;
154         }
155 
maximumDepth()156         public int maximumDepth() {
157             int result = 0;
158             for (Cell cell : byteToCell.values()) {
159                 if (cell.nextRow != null) {
160                     int temp = cell.nextRow.maximumDepth() + 1;
161                     if (result < temp) {
162                         result = temp;
163                     }
164                 }
165             }
166             return result;
167         }
168 
compareTo(Object o)169         public int compareTo(Object o) {
170             Row other = (Row) o;
171             int result;
172             // we want to sort items first with the fewest number of non-terminating
173             // returns
174             // cells, then most
175             // number of terminating returns, then most number of returns
176             if (0 != (result = maximumDepth() - other.maximumDepth()))
177                 return result;
178             if (0 != (result = byteToCell.size() - other.byteToCell.size()))
179                 return result;
180             // otherwise, try alphabetic among the keys. We are guaranteed that the
181             // sizes are the same
182             java.util.Iterator<Byte> otherIt = other.byteToCell.keySet().iterator();
183             for (byte key : byteToCell.keySet()) {
184                 int otherKey = otherIt.next();
185                 if (0 != (result = key - otherKey)) {
186                     return result;
187                 }
188                 // at this point, we are guaranteed that the keys are the same. Compare
189                 // deltaResults, and row
190                 Cell cell = byteToCell.get(key);
191                 Cell otherCell = other.byteToCell.get(key);
192                 if (0 != (result = cell.deltaResult - otherCell.deltaResult)) {
193                     return result;
194                 }
195             }
196             // if we fail completely, use the age.
197             return referenceNumber - other.referenceNumber;
198         }
199 
toString()200         public String toString() {
201             StringBuilder buffer = new StringBuilder();
202             buffer.append("R" + getReferenceNumber() + hasUniqueValue.debugName() + "{");
203             boolean first = true;
204             Set<Byte> sorted = new TreeSet<Byte>(unsignedByteComparator);
205             sorted.addAll(byteToCell.keySet());
206             for (Byte key : sorted) {
207                 if (first) {
208                     first = false;
209                 } else {
210                     buffer.append(' ');
211                 }
212                 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2));
213                 buffer.append('=');
214                 buffer.append(byteToCell.get(key));
215             }
216             buffer.append('}');
217             return buffer.toString();
218         }
219 
toStringCells()220         public String toStringCells() {
221             StringBuilder buffer = new StringBuilder();
222             for (Byte key : byteToCell.keySet()) {
223                 buffer.append(com.ibm.icu.impl.Utility.hex(key & 0xFF, 2));
224                 buffer.append(byteToCell.get(key).toString());
225                 buffer.append(' ');
226             }
227             return buffer.toString();
228         }
229 
getReferenceNumber()230         public int getReferenceNumber() {
231             return referenceNumber;
232         }
233 
compact(byte[] target)234         int compact(byte[] target) {
235             int pos = 0;
236             for (Byte key : byteToCell.keySet()) {
237                 target[pos++] = key;
238                 pos = byteToCell.get(key).addBytes(target, pos, 0);
239             }
240             target[pos++] = 0;
241             return pos;
242         }
243     }
244 
245     static class Cell {
246         public Row nextRow; // next state
247 
248         public int deltaResult;
249 
250         public boolean returns;
251 
addBytes(byte[] target, int pos, int rowDelta)252         public int addBytes(byte[] target, int pos, int rowDelta) {
253             pos = StateDictionary.addBytes(deltaResult, target, pos);
254             int rowOffset = nextRow == null ? 0 : rowDelta - nextRow.getReferenceNumber();
255             rowOffset <<= 1; // make room for returns
256             if (returns)
257                 rowOffset |= 1;
258             return StateDictionary.addBytes(rowOffset, target, pos);
259         }
260 
toString()261         public String toString() {
262             String result = deltaResult == 0 ? "" : String.valueOf(deltaResult);
263             if (returns) {
264                 result += "*";
265             }
266             if (nextRow != null) {
267                 if (result.length() != 0) {
268                     result += "/";
269                 }
270                 result += "R" + nextRow.getReferenceNumber();
271             }
272             return result;
273         }
274     }
275 
276     // should be private, but easier to debug if package private
277     class StateMatcher extends Matcher<T> {
278         private static final boolean SHOW_DEBUG = false;
279 
280         final private byte[] matchByteBuffer = new byte[byteString
281             .getMaxBytesPerChar()];
282 
283         private int matchByteStringIndex;
284 
285         private int matchByteBufferLength;
286 
287         // only used in matching
288         private Row matchCurrentRow;
289 
290         private int matchIntValue = -1;
291 
292         private Row matchLastRow;
293 
294         @Override
setOffset(int offset)295         public Matcher<T> setOffset(int offset) {
296             matchCurrentRow = builtBaseRow;
297             partialLastRow = null; // can remove this later, only for debugging
298             partialMatchValue = 0; // ditto
299             matchIntValue = 0;
300             myMatchEnd = offset;
301             matchValue = null;
302             byteString.clear();
303             matchByteStringIndex = offset;
304             return super.setOffset(offset);
305         }
306 
307         int myMatchEnd;
308 
309         private Row partialLastRow;
310 
311         private int partialMatchValue;
312 
next()313         public Status next() {
314             if (SHOW_DEBUG) {
315                 System.out.println("NEXT: " + this);
316             }
317             if (matchCurrentRow == null) {
318                 matchIntValue = -1;
319                 matchValue = null;
320                 return Status.NONE;
321             }
322             Status result = Status.PARTIAL;
323 
324             while (text.hasCharAt(myMatchEnd)) {
325                 // get more bytes IF matchEnd is set right
326                 if (myMatchEnd == matchByteStringIndex) {
327                     if (true) { // matchEnd < text.length()
328                         char ch = text.charAt(matchByteStringIndex++);
329                         matchByteBufferLength = byteString.toBytes(ch, matchByteBuffer, 0);
330                         if (SHOW_DEBUG) {
331                             System.out.println("\tChar: " + ch + "\t" + com.ibm.icu.impl.Utility.hex(ch) + "\t->\t"
332                                 + CldrUtility.hex(matchByteBuffer, 0, matchByteBufferLength, " "));
333                         }
334                     } else {
335                         matchByteBufferLength = byteString.toBytes(matchByteBuffer, 0);
336                     }
337                 }
338                 for (int i = 0; i < matchByteBufferLength; ++i) {
339                     result = nextByte(matchByteBuffer[i]);
340                     if (result != Status.PARTIAL) {
341                         break;
342                     }
343                 }
344                 // Normally, we will never have a return value except at the end of a character, so we don't need
345                 // to check after each nextByte. However, if the byteString converts C to a sequence of bytes that
346                 // is a prefix of what it converts D into, then we will get a partial match *WITHIN* a character
347 
348                 if (result == Status.PARTIAL) {
349                     ++myMatchEnd;
350                     // and continue with the loop
351                 } else if (result == Status.MATCH) {
352                     ++myMatchEnd;
353                     matchValue = builtResults.get(matchIntValue);
354                     matchEnd = myMatchEnd;
355                     if (SHOW_DEBUG) {
356                         System.out.println("NEXT RESULT: " + result + "\t" + this);
357                     }
358                     return result;
359                 } else {
360                     // if we didn't get a MATCH, we have NONE. But in reality, there could be a possible match
361                     // so we check to see whether the current row allows for any continuation.
362                     if (myMatchEnd > offset && matchCurrentRow.byteToCell.size() > 0) {
363                         result = Status.PARTIAL;
364                     }
365                     if (result == Status.NONE) {
366                         matchIntValue = -1;
367                         matchValue = null;
368                     }
369                     break;
370                 }
371             }
372             matchLastRow = matchCurrentRow;
373             matchCurrentRow = null;
374             if (result == Status.PARTIAL) {
375                 matchValue = builtResults.get(matchIntValue);
376                 matchEnd = myMatchEnd;
377                 partialLastRow = matchLastRow;
378                 partialMatchValue = matchIntValue;
379                 if (SHOW_DEBUG) {
380                     System.out.println("NEXT RESULT: " + result + "\t" + this);
381                 }
382             }
383             return result;
384         }
385 
386         /**
387          * Returns NONE if we cannot go any farther, MATCH if there was a match, and PARTIAL otherwise.
388          * If we couldn't go any farther, then the currentRow is left alone.
389          *
390          * @param chunk
391          * @return
392          */
nextByte(int chunk)393         private Status nextByte(int chunk) {
394             if (SHOW_DEBUG) {
395                 System.out.println("\t" + debugReferenceNumber + "\tRow: " + matchCurrentRow);
396             }
397             Cell cell = matchCurrentRow.byteToCell.get((byte) chunk);
398             if (cell == null) {
399                 return Status.NONE;
400             }
401             matchIntValue += cell.deltaResult;
402             matchCurrentRow = cell.nextRow;
403             if (cell.returns) {
404                 return Status.MATCH;
405             }
406             return Status.PARTIAL;
407         }
408 
getIntMatchValue()409         public int getIntMatchValue() {
410             return matchIntValue;
411         }
412 
nextUniquePartial()413         public boolean nextUniquePartial() {
414             if (partialLastRow.hasUniqueValue == Uniqueness.UNIQUE) {
415                 matchValue = builtResults.get(partialMatchValue);
416                 matchEnd = myMatchEnd;
417                 return true;
418             }
419             return false;
420         }
421 
422         @Override
getDictionary()423         public StateDictionary<T> getDictionary() {
424             return StateDictionary.this;
425         }
426     }
427 
428     static final Comparator<Byte> unsignedByteComparator = new Comparator<Byte>() {
429 
430         public int compare(Byte o1, Byte o2) {
431             int b1 = o1 & 0xFF;
432             int b2 = o2 & 0xFF;
433             return b1 < b2 ? -1 : b1 > b2 ? 1 : 0;
434         }
435 
436     };
437 
438     static final Comparator<Row> rowComparator = new Comparator<Row>() {
439 
440         public int compare(Row row1, Row row2) {
441             if (row1 == row2) {
442                 return 0;
443             } else if (row1 == null) {
444                 return -1;
445             } else if (row2 == null) {
446                 return 1;
447             }
448             int result;
449             if (0 != (result = row1.byteToCell.size() - row2.byteToCell.size())) {
450                 return result;
451             }
452             java.util.Iterator<Byte> otherIt = row2.byteToCell.keySet().iterator();
453             for (byte key : row1.byteToCell.keySet()) {
454                 byte otherKey = otherIt.next();
455                 if (0 != (result = key - otherKey)) {
456                     return result;
457                 }
458                 // at this point, we are guaranteed that the keys are the same. Compare
459                 // deltaResults, returns, and then recurse on the the row
460                 Cell cell1 = row1.byteToCell.get(key);
461                 Cell cell2 = row2.byteToCell.get(key);
462                 if (0 != (result = cell1.deltaResult - cell2.deltaResult)) {
463                     return result;
464                 }
465                 if (cell1.returns != cell2.returns) {
466                     return cell1.returns ? 1 : -1;
467                 }
468                 if (0 != (result = compare(cell1.nextRow, cell2.nextRow))) {
469                     return result;
470                 }
471             }
472             return 0;
473 
474         }
475 
476     };
477 
addBytes(int source, byte[] target, int pos)478     static int addBytes(int source, byte[] target, int pos) {
479         // swap the top bit
480         if (source < 0) {
481             source = ((-source) << 1) | 1;
482         } else {
483             source <<= 1;
484         }
485         // emit the rest as 7 bit quantities with 1 as termination bit
486         while (true) {
487             byte b = (byte) (source & 0x7F);
488             source >>>= 7;
489             if (source == 0) {
490                 b |= 0x80;
491                 target[pos++] = b;
492                 return pos;
493             }
494             target[pos++] = b;
495         }
496     }
497 
498     private class TextFetcher {
499 
500         Map<CharSequence, T> result = new TreeMap<CharSequence, T>();
501 
502         byte[] soFar = new byte[builtMaxByteLength];
503 
504         BitSet shown = new BitSet();
505 
506         StringBuilder buffer = new StringBuilder();
507 
508         StringBuilder debugTreeView = new StringBuilder();
509 
510         private HashSet<Row> rowsSeen = new HashSet<Row>();
511 
getWords()512         public Map<CharSequence, T> getWords() {
513             result.clear();
514             getWords(0, 0, builtBaseRow);
515             return result;
516         }
517 
debugShow()518         public String debugShow() {
519             rowsSeen.clear();
520             Counter<Integer> debugCounter = new Counter<Integer>();
521             getDebugWords(0, 0, builtBaseRow, Integer.MAX_VALUE);
522             for (Row row : builtRows) {
523                 debugCounter.add(row.byteToCell.size(), 1);
524             }
525             for (Integer item : (Collection<Integer>) debugCounter
526                 .getKeysetSortedByKey()) {
527                 debugTreeView.append("cells in row=\t").append(item).append(
528                     "\trows with count=\t").append(debugCounter.getCount(item)).append(
529                         CldrUtility.LINE_SEPARATOR);
530             }
531             return debugTreeView.toString();
532         }
533 
getDebugWords(int byteLength, int resultSoFar, Row row, int suppressAbove)534         private void getDebugWords(int byteLength, int resultSoFar, Row row,
535             int suppressAbove) {
536             // we do this to show when rows have already been seen, and so the structure has been compacted
537             if (rowsSeen.contains(row)) {
538                 // reset if bigger
539                 if (suppressAbove > byteLength) {
540                     suppressAbove = byteLength;
541                 }
542             } else {
543                 rowsSeen.add(row);
544             }
545             // walk through the cells, display and recurse
546             Set<Byte> sorted = new TreeSet<Byte>(unsignedByteComparator);
547             sorted.addAll(row.byteToCell.keySet());
548             for (Byte key : sorted) {
549                 Cell cell = row.byteToCell.get(key);
550                 soFar[byteLength] = key;
551                 shown.set(byteLength, false);
552                 int currentValue = resultSoFar + cell.deltaResult;
553                 if (cell.returns) {
554                     CharSequence key2 = stringFromBytes(soFar, byteLength + 1);
555                     T value2 = builtResults.get(currentValue);
556                     for (int i = 0; i <= byteLength; ++i) {
557                         debugTreeView.append(' ');
558                         if (i >= suppressAbove) {
559                             debugTreeView.append("++");
560                         } else if (shown.get(i)) {
561                             debugTreeView.append("--");
562                         } else {
563                             debugTreeView.append(com.ibm.icu.impl.Utility.hex(
564                                 soFar[i] & 0xFF, 2));
565                             shown.set(i);
566                         }
567                     }
568                     debugTreeView.append("\t<").append(key2).append(">\t<")
569                         .append(value2).append(">" + CldrUtility.LINE_SEPARATOR);
570                 }
571                 if (cell.nextRow != null) {
572                     getDebugWords(byteLength + 1, currentValue, cell.nextRow,
573                         suppressAbove);
574                 }
575             }
576         }
577 
578         // recurse through the strings
getWords(int byteLength, int resultSoFar, Row row)579         private void getWords(int byteLength, int resultSoFar, Row row) {
580             for (Byte key : row.byteToCell.keySet()) {
581                 Cell cell = row.byteToCell.get(key);
582                 soFar[byteLength] = key;
583                 int currentValue = resultSoFar + cell.deltaResult;
584                 if (cell.returns) {
585                     CharSequence key2 = stringFromBytes(soFar, byteLength + 1);
586                     T value2 = builtResults.get(currentValue);
587                     result.put(key2, value2);
588                 }
589                 if (cell.nextRow != null) {
590                     getWords(byteLength + 1, currentValue, cell.nextRow);
591                 }
592             }
593         }
594 
stringFromBytes(byte[] soFar, int len)595         private CharSequence stringFromBytes(byte[] soFar, int len) {
596             buffer.setLength(0);
597             byteString.fromBytes(soFar, 0, len, buffer);
598             return buffer.toString();
599         }
600     }
601 
602     /**
603      * Just for testing flattening.
604      *
605      *
606      */
flatten()607     public void flatten() {
608         TreeSet<Row> s = new TreeSet<Row>(builtRows);
609         int count = 0;
610         int oldDepth = 999;
611         String oldCell = "";
612         int uniqueCount = 0;
613         int cellCount = 0;
614         byte[] target = new byte[500];
615         int totalBytesCompacted = 0;
616         for (Row row : s) {
617             row.referenceNumber = count++;
618             int depth = row.maximumDepth();
619             if (depth != oldDepth) {
620                 if (DEBUG_FLATTEN) {
621                     System.out.println("*** " + depth + "***");
622                 }
623                 oldDepth = depth;
624             }
625             int bytesCompacted = row.compact(target);
626             if (DEBUG_FLATTEN) {
627                 System.out.println(bytesCompacted + "\t" + row);
628             }
629             String newCell = row.toStringCells();
630             if (!newCell.equals(oldCell)) {
631                 uniqueCount++;
632                 totalBytesCompacted += bytesCompacted;
633                 cellCount += row.byteToCell.size();
634             }
635             oldCell = newCell;
636 
637             for (Cell cell : row.byteToCell.values()) {
638                 if (cell.nextRow != null && cell.nextRow.referenceNumber > row.referenceNumber) {
639                     if (DEBUG_FLATTEN) {
640                         System.out.println("*** Fail");
641                     }
642                     break;
643                 }
644             }
645         }
646         System.out.println("Count: " + count);
647         System.out.println("UniqueCount: " + uniqueCount);
648         System.out.println("CellCount: " + cellCount);
649         System.out.println("TotalBytesCompacted: " + totalBytesCompacted);
650     }
651 
652 }