• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
4  **********************************************************************
5  * Author: Mark Davis
6  **********************************************************************
7  */
8 package org.unicode.cldr.util;
9 
10 import java.util.ArrayList;
11 import java.util.Comparator;
12 import java.util.HashMap;
13 import java.util.Map;
14 import java.util.TreeMap;
15 
16 import org.unicode.cldr.util.Dictionary.DictionaryBuilder;
17 import org.unicode.cldr.util.IntMap.BasicIntMapFactory;
18 import org.unicode.cldr.util.IntMap.IntMapFactory;
19 import org.unicode.cldr.util.StateDictionary.Cell;
20 import org.unicode.cldr.util.StateDictionary.Row;
21 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness;
22 
23 /**
24  * A simple state-table based dictionary builder.
25  *
26  * @author markdavis
27  * @param <T>
28  *            the return type for the dictionary
29  */
30 public class StateDictionaryBuilder<T> implements DictionaryBuilder<T> {
31 
32     private static final boolean SHOW_SIZE = true;
33 
34     // only used while building
35     private Row buildingCurrentAddRow;
36     private int builtTotalBytes;
37     private int builtTotalStrings;
38 
39     // results of build
40     private ArrayList<Row> builtRows;
41     private Row builtBaseRow;
42     private IntMap<T> builtResults;
43     private int builtMaxByteLength;
44 
45     private StringByteConverter byteConverter = new CompactStringByteConverter(true); // new StringUtf8Converter(); //
46     // new ByteString(false); // new
47     // ByteString(true); //
48 
49     private IntMapFactory<T> intMapFactory = new BasicIntMapFactory<T>();
50 
51     /**
52      * Get/set the IntMapFactory used to store the values for T. The default is BasicIntMapFactory.
53      *
54      * @return
55      */
getIntMapFactory()56     public IntMapFactory<T> getIntMapFactory() {
57         return intMapFactory;
58     }
59 
setIntMapFactory(IntMapFactory<T> intMapFactory)60     public StateDictionaryBuilder<T> setIntMapFactory(IntMapFactory<T> intMapFactory) {
61         this.intMapFactory = intMapFactory;
62         return this;
63     }
64 
65     /**
66      * Get/Set the StringByteConverter used to convert strings to bytes and back. The default is a compacted form:
67      * ByteString(true).
68      *
69      * @return
70      */
getByteConverter()71     public StringByteConverter getByteConverter() {
72         return byteConverter;
73     }
74 
setByteConverter(StringByteConverter byteConverter)75     public StateDictionaryBuilder<T> setByteConverter(StringByteConverter byteConverter) {
76         this.byteConverter = byteConverter;
77         return this;
78     }
79 
80     /**
81      * Create a new simple StateDictionary. This format is relatively fast to
82      * produce, and has a fair amount of compaction. The Map must be sorted
83      * according to Dictionary.CHAR_SEQUENCE_COMPARATOR. It must not contain the key "".
84      *
85      * @param source
86      * @return
87      */
make(Map<CharSequence, T> source)88     public StateDictionary<T> make(Map<CharSequence, T> source) {
89         // clear out state
90         buildingCurrentAddRow = null;
91         builtTotalBytes = builtTotalStrings = builtMaxByteLength = 0;
92         builtRows = new ArrayList<Row>();
93         builtBaseRow = makeRow();
94         builtResults = intMapFactory.make(source.values());
95         if (SHOW_SIZE) System.out.println("***VALUE STORAGE: " + builtResults.approximateStorage());
96 
97         Map<T, Integer> valueToInt = builtResults.getValueMap();
98         Map<byte[], Integer> sorted = new TreeMap<byte[], Integer>(SHORTER_BYTE_ARRAY_COMPARATOR);
99         for (CharSequence text : source.keySet()) {
100             sorted.put(byteConverter.toBytes(text), valueToInt.get(source.get(text)));
101         }
102 
103         for (byte[] key : sorted.keySet()) {
104             addMapping(key, sorted.get(key));
105         }
106 
107         // now compact the rows
108         // first find out which rows are equivalent (recursively)
109         Map<Row, Row> replacements = new HashMap<Row, Row>();
110         {
111             Map<Row, Row> equivalents = new TreeMap<Row, Row>(StateDictionary.rowComparator);
112             for (Row row : builtRows) {
113                 Row cardinal = equivalents.get(row);
114                 if (cardinal == null) {
115                     equivalents.put(row, row);
116                 } else {
117                     replacements.put(row, cardinal);
118                 }
119             }
120         }
121         if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size() + "\t REPLACEMENTS: " + replacements.size());
122 
123         // now replace all references to rows by their equivalents
124         for (Row row : builtRows) {
125             for (Byte key : row.byteToCell.keySet()) {
126                 Cell cell = row.byteToCell.get(key);
127                 Row newRow = replacements.get(cell.nextRow);
128                 if (newRow != null) {
129                     cell.nextRow = newRow;
130                 }
131             }
132         }
133         // now compact the rows array
134         ArrayList<Row> newRows = new ArrayList<Row>();
135         for (Row row : builtRows) {
136             if (!replacements.containsKey(row)) {
137                 newRows.add(row);
138             }
139         }
140         // and set the Uniqueness values
141         setUniqueValues(builtBaseRow);
142         builtRows = newRows;
143         if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size());
144         return new StateDictionary<T>(builtBaseRow, builtRows, builtResults, builtMaxByteLength, byteConverter);
145     }
146 
makeRow()147     private Row makeRow() {
148         Row row = new Row(builtRows.size());
149         builtRows.add(row);
150         return row;
151     }
152 
addMapping(byte[] key, int result)153     private void addMapping(byte[] key, int result) {
154         buildingCurrentAddRow = builtBaseRow;
155 
156         int lastIndex = key.length - 1;
157         for (int i = 0; i <= lastIndex; ++i) {
158             result = add(key[i], result, i == lastIndex);
159         }
160         builtTotalBytes += key.length;
161         builtTotalStrings += 1;
162         if (builtMaxByteLength < key.length) {
163             builtMaxByteLength = key.length;
164         }
165     }
166 
add(byte key, int result, boolean last)167     private int add(byte key, int result, boolean last) {
168         Cell matchingCell = buildingCurrentAddRow.byteToCell.get(key);
169         if (matchingCell != null) {
170             if (matchingCell.nextRow == null && !last) {
171                 matchingCell.nextRow = makeRow();
172                 // we add a continuation, so decrement
173                 --buildingCurrentAddRow.terminatingReturnCount;
174             }
175             buildingCurrentAddRow = matchingCell.nextRow;
176             return result - matchingCell.deltaResult;
177         }
178         Cell cell = new Cell();
179         buildingCurrentAddRow.byteToCell.put(key, cell);
180         cell.deltaResult = result;
181         if (last) {
182             cell.returns = true;
183             ++buildingCurrentAddRow.returnCount;
184             ++buildingCurrentAddRow.terminatingReturnCount;
185         } else {
186             cell.nextRow = buildingCurrentAddRow = makeRow();
187         }
188         // when we create a new cell for the first time, we deposit the
189         // result, so we can clear it now
190         return 0;
191     }
192 
193     /**
194      * Follow all the path values, and determine whether all possible results from
195      * a row (when following bytes) are the same. If so, set the flag on the row.
196      * The return value
197      *
198      * @param savedMatchValue
199      *            TODO
200      * @return true if unique
201      */
202     // NOTE: The way the table is built, we are guaranteed that the current value
203     // is "correct" (with no deltas needed) for at least one of the paths
204     // resulting from the row, EXCEPT in the very first row. Thus except for the first row,
205     // every row will have at least one cell with a zero deltaResult. That in turn means that
206     // if there is a deltaResult on a cell, at least one chain resulting from taking that cell
207     // will have that result.
208     // So, if there are any deltaResults in a row, it is not unique.
209     // Otherwise, if any of the cells have non-unique rows, it is not unique
210     // Otherwise it is unique.
211     // So this traverses the rows, setting the values for anything it hasn't seen.
212     // TODO: could be optimized (maybe) by saving the rows seen so far. On the other hand,
213     // this might not be worth the time in adding them to a save list.
setUniqueValues(Row currentRow)214     private boolean setUniqueValues(Row currentRow) {
215         if (currentRow.hasUniqueValue == Uniqueness.UNIQUE) {
216             return true;
217         }
218         // see if there is any reason to make us not Unique
219         // According to the structure above:
220         // It will be that we have a child that is not unique,
221         // or that we have a nonzero delta value
222         // We don't have to look at anything else.
223         // We DO have to call setUniqueValues on each of our children, to make sure they are set.
224         for (Cell cell : currentRow.byteToCell.values()) {
225             if (cell.nextRow != null && !setUniqueValues(cell.nextRow)) {
226                 currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS;
227             } else if (cell.deltaResult != 0) {
228                 currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS;
229             }
230         }
231         if (currentRow.hasUniqueValue == Uniqueness.AMBIGUOUS) {
232             return false;
233         }
234         currentRow.hasUniqueValue = Uniqueness.UNIQUE;
235         return true;
236     }
237 
238     static final Comparator<byte[]> SHORTER_BYTE_ARRAY_COMPARATOR = new Comparator<byte[]>() {
239 
240         public int compare(byte[] o1, byte[] o2) {
241             int minLen = o1.length;
242             if (minLen > o2.length) {
243                 minLen = o2.length;
244             }
245             for (int i = 0; i < minLen; ++i) {
246                 if (o1[i] != o2[i]) {
247                     return o1[i] < o2[i] ? -1 : 1; // return lesser first
248                 }
249             }
250             return o1.length < o2.length ? -1 : o1.length > o2.length ? 1 : 0;
251         }
252 
253     };
254 }