1 /* 2 ********************************************************************** 3 * Copyright (c) 2006-2007, Google and others. All Rights Reserved. 4 ********************************************************************** 5 * Author: Mark Davis 6 ********************************************************************** 7 */ 8 package org.unicode.cldr.util; 9 10 import java.util.ArrayList; 11 import java.util.Comparator; 12 import java.util.HashMap; 13 import java.util.Map; 14 import java.util.TreeMap; 15 16 import org.unicode.cldr.util.Dictionary.DictionaryBuilder; 17 import org.unicode.cldr.util.IntMap.BasicIntMapFactory; 18 import org.unicode.cldr.util.IntMap.IntMapFactory; 19 import org.unicode.cldr.util.StateDictionary.Cell; 20 import org.unicode.cldr.util.StateDictionary.Row; 21 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness; 22 23 /** 24 * A simple state-table based dictionary builder. 25 * 26 * @author markdavis 27 * @param <T> 28 * the return type for the dictionary 29 */ 30 public class StateDictionaryBuilder<T> implements DictionaryBuilder<T> { 31 32 private static final boolean SHOW_SIZE = true; 33 34 // only used while building 35 private Row buildingCurrentAddRow; 36 private int builtTotalBytes; 37 private int builtTotalStrings; 38 39 // results of build 40 private ArrayList<Row> builtRows; 41 private Row builtBaseRow; 42 private IntMap<T> builtResults; 43 private int builtMaxByteLength; 44 45 private StringByteConverter byteConverter = new CompactStringByteConverter(true); // new StringUtf8Converter(); // 46 // new ByteString(false); // new 47 // ByteString(true); // 48 49 private IntMapFactory<T> intMapFactory = new BasicIntMapFactory<T>(); 50 51 /** 52 * Get/set the IntMapFactory used to store the values for T. The default is BasicIntMapFactory. 53 * 54 * @return 55 */ getIntMapFactory()56 public IntMapFactory<T> getIntMapFactory() { 57 return intMapFactory; 58 } 59 setIntMapFactory(IntMapFactory<T> intMapFactory)60 public StateDictionaryBuilder<T> setIntMapFactory(IntMapFactory<T> intMapFactory) { 61 this.intMapFactory = intMapFactory; 62 return this; 63 } 64 65 /** 66 * Get/Set the StringByteConverter used to convert strings to bytes and back. The default is a compacted form: 67 * ByteString(true). 68 * 69 * @return 70 */ getByteConverter()71 public StringByteConverter getByteConverter() { 72 return byteConverter; 73 } 74 setByteConverter(StringByteConverter byteConverter)75 public StateDictionaryBuilder<T> setByteConverter(StringByteConverter byteConverter) { 76 this.byteConverter = byteConverter; 77 return this; 78 } 79 80 /** 81 * Create a new simple StateDictionary. This format is relatively fast to 82 * produce, and has a fair amount of compaction. The Map must be sorted 83 * according to Dictionary.CHAR_SEQUENCE_COMPARATOR. It must not contain the key "". 84 * 85 * @param source 86 * @return 87 */ make(Map<CharSequence, T> source)88 public StateDictionary<T> make(Map<CharSequence, T> source) { 89 // clear out state 90 buildingCurrentAddRow = null; 91 builtTotalBytes = builtTotalStrings = builtMaxByteLength = 0; 92 builtRows = new ArrayList<Row>(); 93 builtBaseRow = makeRow(); 94 builtResults = intMapFactory.make(source.values()); 95 if (SHOW_SIZE) System.out.println("***VALUE STORAGE: " + builtResults.approximateStorage()); 96 97 Map<T, Integer> valueToInt = builtResults.getValueMap(); 98 Map<byte[], Integer> sorted = new TreeMap<byte[], Integer>(SHORTER_BYTE_ARRAY_COMPARATOR); 99 for (CharSequence text : source.keySet()) { 100 sorted.put(byteConverter.toBytes(text), valueToInt.get(source.get(text))); 101 } 102 103 for (byte[] key : sorted.keySet()) { 104 addMapping(key, sorted.get(key)); 105 } 106 107 // now compact the rows 108 // first find out which rows are equivalent (recursively) 109 Map<Row, Row> replacements = new HashMap<Row, Row>(); 110 { 111 Map<Row, Row> equivalents = new TreeMap<Row, Row>(StateDictionary.rowComparator); 112 for (Row row : builtRows) { 113 Row cardinal = equivalents.get(row); 114 if (cardinal == null) { 115 equivalents.put(row, row); 116 } else { 117 replacements.put(row, cardinal); 118 } 119 } 120 } 121 if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size() + "\t REPLACEMENTS: " + replacements.size()); 122 123 // now replace all references to rows by their equivalents 124 for (Row row : builtRows) { 125 for (Byte key : row.byteToCell.keySet()) { 126 Cell cell = row.byteToCell.get(key); 127 Row newRow = replacements.get(cell.nextRow); 128 if (newRow != null) { 129 cell.nextRow = newRow; 130 } 131 } 132 } 133 // now compact the rows array 134 ArrayList<Row> newRows = new ArrayList<Row>(); 135 for (Row row : builtRows) { 136 if (!replacements.containsKey(row)) { 137 newRows.add(row); 138 } 139 } 140 // and set the Uniqueness values 141 setUniqueValues(builtBaseRow); 142 builtRows = newRows; 143 if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size()); 144 return new StateDictionary<T>(builtBaseRow, builtRows, builtResults, builtMaxByteLength, byteConverter); 145 } 146 makeRow()147 private Row makeRow() { 148 Row row = new Row(builtRows.size()); 149 builtRows.add(row); 150 return row; 151 } 152 addMapping(byte[] key, int result)153 private void addMapping(byte[] key, int result) { 154 buildingCurrentAddRow = builtBaseRow; 155 156 int lastIndex = key.length - 1; 157 for (int i = 0; i <= lastIndex; ++i) { 158 result = add(key[i], result, i == lastIndex); 159 } 160 builtTotalBytes += key.length; 161 builtTotalStrings += 1; 162 if (builtMaxByteLength < key.length) { 163 builtMaxByteLength = key.length; 164 } 165 } 166 add(byte key, int result, boolean last)167 private int add(byte key, int result, boolean last) { 168 Cell matchingCell = buildingCurrentAddRow.byteToCell.get(key); 169 if (matchingCell != null) { 170 if (matchingCell.nextRow == null && !last) { 171 matchingCell.nextRow = makeRow(); 172 // we add a continuation, so decrement 173 --buildingCurrentAddRow.terminatingReturnCount; 174 } 175 buildingCurrentAddRow = matchingCell.nextRow; 176 return result - matchingCell.deltaResult; 177 } 178 Cell cell = new Cell(); 179 buildingCurrentAddRow.byteToCell.put(key, cell); 180 cell.deltaResult = result; 181 if (last) { 182 cell.returns = true; 183 ++buildingCurrentAddRow.returnCount; 184 ++buildingCurrentAddRow.terminatingReturnCount; 185 } else { 186 cell.nextRow = buildingCurrentAddRow = makeRow(); 187 } 188 // when we create a new cell for the first time, we deposit the 189 // result, so we can clear it now 190 return 0; 191 } 192 193 /** 194 * Follow all the path values, and determine whether all possible results from 195 * a row (when following bytes) are the same. If so, set the flag on the row. 196 * The return value 197 * 198 * @param savedMatchValue 199 * TODO 200 * @return true if unique 201 */ 202 // NOTE: The way the table is built, we are guaranteed that the current value 203 // is "correct" (with no deltas needed) for at least one of the paths 204 // resulting from the row, EXCEPT in the very first row. Thus except for the first row, 205 // every row will have at least one cell with a zero deltaResult. That in turn means that 206 // if there is a deltaResult on a cell, at least one chain resulting from taking that cell 207 // will have that result. 208 // So, if there are any deltaResults in a row, it is not unique. 209 // Otherwise, if any of the cells have non-unique rows, it is not unique 210 // Otherwise it is unique. 211 // So this traverses the rows, setting the values for anything it hasn't seen. 212 // TODO: could be optimized (maybe) by saving the rows seen so far. On the other hand, 213 // this might not be worth the time in adding them to a save list. setUniqueValues(Row currentRow)214 private boolean setUniqueValues(Row currentRow) { 215 if (currentRow.hasUniqueValue == Uniqueness.UNIQUE) { 216 return true; 217 } 218 // see if there is any reason to make us not Unique 219 // According to the structure above: 220 // It will be that we have a child that is not unique, 221 // or that we have a nonzero delta value 222 // We don't have to look at anything else. 223 // We DO have to call setUniqueValues on each of our children, to make sure they are set. 224 for (Cell cell : currentRow.byteToCell.values()) { 225 if (cell.nextRow != null && !setUniqueValues(cell.nextRow)) { 226 currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS; 227 } else if (cell.deltaResult != 0) { 228 currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS; 229 } 230 } 231 if (currentRow.hasUniqueValue == Uniqueness.AMBIGUOUS) { 232 return false; 233 } 234 currentRow.hasUniqueValue = Uniqueness.UNIQUE; 235 return true; 236 } 237 238 static final Comparator<byte[]> SHORTER_BYTE_ARRAY_COMPARATOR = new Comparator<byte[]>() { 239 240 public int compare(byte[] o1, byte[] o2) { 241 int minLen = o1.length; 242 if (minLen > o2.length) { 243 minLen = o2.length; 244 } 245 for (int i = 0; i < minLen; ++i) { 246 if (o1[i] != o2[i]) { 247 return o1[i] < o2[i] ? -1 : 1; // return lesser first 248 } 249 } 250 return o1.length < o2.length ? -1 : o1.length > o2.length ? 1 : 0; 251 } 252 253 }; 254 }