• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
4  **********************************************************************
5  * Author: Mark Davis
6  **********************************************************************
7  */
8 package org.unicode.cldr.util;
9 
10 import java.util.ArrayList;
11 import java.util.Comparator;
12 import java.util.HashMap;
13 import java.util.Map;
14 import java.util.TreeMap;
15 
16 import org.unicode.cldr.util.Dictionary.DictionaryBuilder;
17 import org.unicode.cldr.util.IntMap.BasicIntMapFactory;
18 import org.unicode.cldr.util.IntMap.IntMapFactory;
19 import org.unicode.cldr.util.StateDictionary.Cell;
20 import org.unicode.cldr.util.StateDictionary.Row;
21 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness;
22 
23 /**
24  * A simple state-table based dictionary builder.
25  *
26  * @author markdavis
27  * @param <T>
28  *            the return type for the dictionary
29  */
30 public class StateDictionaryBuilder<T> implements DictionaryBuilder<T> {
31 
32     private static final boolean SHOW_SIZE = true;
33 
34     // only used while building
35     private Row buildingCurrentAddRow;
36     private int builtTotalBytes;
37     private int builtTotalStrings;
38 
39     // results of build
40     private ArrayList<Row> builtRows;
41     private Row builtBaseRow;
42     private IntMap<T> builtResults;
43     private int builtMaxByteLength;
44 
45     private StringByteConverter byteConverter = new CompactStringByteConverter(true); // new StringUtf8Converter(); //
46     // new ByteString(false); // new
47     // ByteString(true); //
48 
49     private IntMapFactory<T> intMapFactory = new BasicIntMapFactory<>();
50 
51     /**
52      * Get/set the IntMapFactory used to store the values for T. The default is BasicIntMapFactory.
53      *
54      * @return
55      */
getIntMapFactory()56     public IntMapFactory<T> getIntMapFactory() {
57         return intMapFactory;
58     }
59 
setIntMapFactory(IntMapFactory<T> intMapFactory)60     public StateDictionaryBuilder<T> setIntMapFactory(IntMapFactory<T> intMapFactory) {
61         this.intMapFactory = intMapFactory;
62         return this;
63     }
64 
65     /**
66      * Get/Set the StringByteConverter used to convert strings to bytes and back. The default is a compacted form:
67      * ByteString(true).
68      *
69      * @return
70      */
getByteConverter()71     public StringByteConverter getByteConverter() {
72         return byteConverter;
73     }
74 
setByteConverter(StringByteConverter byteConverter)75     public StateDictionaryBuilder<T> setByteConverter(StringByteConverter byteConverter) {
76         this.byteConverter = byteConverter;
77         return this;
78     }
79 
80     /**
81      * Create a new simple StateDictionary. This format is relatively fast to
82      * produce, and has a fair amount of compaction. The Map must be sorted
83      * according to Dictionary.CHAR_SEQUENCE_COMPARATOR. It must not contain the key "".
84      *
85      * @param source
86      * @return
87      */
88     @Override
make(Map<CharSequence, T> source)89     public StateDictionary<T> make(Map<CharSequence, T> source) {
90         // clear out state
91         buildingCurrentAddRow = null;
92         builtTotalBytes = builtTotalStrings = builtMaxByteLength = 0;
93         builtRows = new ArrayList<>();
94         builtBaseRow = makeRow();
95         builtResults = intMapFactory.make(source.values());
96         if (SHOW_SIZE) System.out.println("***VALUE STORAGE: " + builtResults.approximateStorage());
97 
98         Map<T, Integer> valueToInt = builtResults.getValueMap();
99         Map<byte[], Integer> sorted = new TreeMap<>(SHORTER_BYTE_ARRAY_COMPARATOR);
100         for (CharSequence text : source.keySet()) {
101             sorted.put(byteConverter.toBytes(text), valueToInt.get(source.get(text)));
102         }
103 
104         for (byte[] key : sorted.keySet()) {
105             addMapping(key, sorted.get(key));
106         }
107 
108         // now compact the rows
109         // first find out which rows are equivalent (recursively)
110         Map<Row, Row> replacements = new HashMap<>();
111         {
112             Map<Row, Row> equivalents = new TreeMap<>(StateDictionary.rowComparator);
113             for (Row row : builtRows) {
114                 Row cardinal = equivalents.get(row);
115                 if (cardinal == null) {
116                     equivalents.put(row, row);
117                 } else {
118                     replacements.put(row, cardinal);
119                 }
120             }
121         }
122         if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size() + "\t REPLACEMENTS: " + replacements.size());
123 
124         // now replace all references to rows by their equivalents
125         for (Row row : builtRows) {
126             for (Byte key : row.byteToCell.keySet()) {
127                 Cell cell = row.byteToCell.get(key);
128                 Row newRow = replacements.get(cell.nextRow);
129                 if (newRow != null) {
130                     cell.nextRow = newRow;
131                 }
132             }
133         }
134         // now compact the rows array
135         ArrayList<Row> newRows = new ArrayList<>();
136         for (Row row : builtRows) {
137             if (!replacements.containsKey(row)) {
138                 newRows.add(row);
139             }
140         }
141         // and set the Uniqueness values
142         setUniqueValues(builtBaseRow);
143         builtRows = newRows;
144         if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size());
145         return new StateDictionary<>(builtBaseRow, builtRows, builtResults, builtMaxByteLength, byteConverter);
146     }
147 
makeRow()148     private Row makeRow() {
149         Row row = new Row(builtRows.size());
150         builtRows.add(row);
151         return row;
152     }
153 
addMapping(byte[] key, int result)154     private void addMapping(byte[] key, int result) {
155         buildingCurrentAddRow = builtBaseRow;
156 
157         int lastIndex = key.length - 1;
158         for (int i = 0; i <= lastIndex; ++i) {
159             result = add(key[i], result, i == lastIndex);
160         }
161         builtTotalBytes += key.length;
162         builtTotalStrings += 1;
163         if (builtMaxByteLength < key.length) {
164             builtMaxByteLength = key.length;
165         }
166     }
167 
add(byte key, int result, boolean last)168     private int add(byte key, int result, boolean last) {
169         Cell matchingCell = buildingCurrentAddRow.byteToCell.get(key);
170         if (matchingCell != null) {
171             if (matchingCell.nextRow == null && !last) {
172                 matchingCell.nextRow = makeRow();
173                 // we add a continuation, so decrement
174                 --buildingCurrentAddRow.terminatingReturnCount;
175             }
176             buildingCurrentAddRow = matchingCell.nextRow;
177             return result - matchingCell.deltaResult;
178         }
179         Cell cell = new Cell();
180         buildingCurrentAddRow.byteToCell.put(key, cell);
181         cell.deltaResult = result;
182         if (last) {
183             cell.returns = true;
184             ++buildingCurrentAddRow.returnCount;
185             ++buildingCurrentAddRow.terminatingReturnCount;
186         } else {
187             cell.nextRow = buildingCurrentAddRow = makeRow();
188         }
189         // when we create a new cell for the first time, we deposit the
190         // result, so we can clear it now
191         return 0;
192     }
193 
194     /**
195      * Follow all the path values, and determine whether all possible results from
196      * a row (when following bytes) are the same. If so, set the flag on the row.
197      * The return value
198      *
199      * @param savedMatchValue
200      *            TODO
201      * @return true if unique
202      */
203     // NOTE: The way the table is built, we are guaranteed that the current value
204     // is "correct" (with no deltas needed) for at least one of the paths
205     // resulting from the row, EXCEPT in the very first row. Thus except for the first row,
206     // every row will have at least one cell with a zero deltaResult. That in turn means that
207     // if there is a deltaResult on a cell, at least one chain resulting from taking that cell
208     // will have that result.
209     // So, if there are any deltaResults in a row, it is not unique.
210     // Otherwise, if any of the cells have non-unique rows, it is not unique
211     // Otherwise it is unique.
212     // So this traverses the rows, setting the values for anything it hasn't seen.
213     // TODO: could be optimized (maybe) by saving the rows seen so far. On the other hand,
214     // this might not be worth the time in adding them to a save list.
setUniqueValues(Row currentRow)215     private boolean setUniqueValues(Row currentRow) {
216         if (currentRow.hasUniqueValue == Uniqueness.UNIQUE) {
217             return true;
218         }
219         // see if there is any reason to make us not Unique
220         // According to the structure above:
221         // It will be that we have a child that is not unique,
222         // or that we have a nonzero delta value
223         // We don't have to look at anything else.
224         // We DO have to call setUniqueValues on each of our children, to make sure they are set.
225         for (Cell cell : currentRow.byteToCell.values()) {
226             if (cell.nextRow != null && !setUniqueValues(cell.nextRow)) {
227                 currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS;
228             } else if (cell.deltaResult != 0) {
229                 currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS;
230             }
231         }
232         if (currentRow.hasUniqueValue == Uniqueness.AMBIGUOUS) {
233             return false;
234         }
235         currentRow.hasUniqueValue = Uniqueness.UNIQUE;
236         return true;
237     }
238 
239     static final Comparator<byte[]> SHORTER_BYTE_ARRAY_COMPARATOR = new Comparator<byte[]>() {
240 
241         @Override
242         public int compare(byte[] o1, byte[] o2) {
243             int minLen = o1.length;
244             if (minLen > o2.length) {
245                 minLen = o2.length;
246             }
247             for (int i = 0; i < minLen; ++i) {
248                 if (o1[i] != o2[i]) {
249                     return o1[i] < o2[i] ? -1 : 1; // return lesser first
250                 }
251             }
252             return o1.length < o2.length ? -1 : o1.length > o2.length ? 1 : 0;
253         }
254 
255     };
256 }