1 /* 2 ********************************************************************** 3 * Copyright (c) 2006-2007, Google and others. All Rights Reserved. 4 ********************************************************************** 5 * Author: Mark Davis 6 ********************************************************************** 7 */ 8 package org.unicode.cldr.util; 9 10 import java.util.ArrayList; 11 import java.util.Comparator; 12 import java.util.HashMap; 13 import java.util.Map; 14 import java.util.TreeMap; 15 16 import org.unicode.cldr.util.Dictionary.DictionaryBuilder; 17 import org.unicode.cldr.util.IntMap.BasicIntMapFactory; 18 import org.unicode.cldr.util.IntMap.IntMapFactory; 19 import org.unicode.cldr.util.StateDictionary.Cell; 20 import org.unicode.cldr.util.StateDictionary.Row; 21 import org.unicode.cldr.util.StateDictionary.Row.Uniqueness; 22 23 /** 24 * A simple state-table based dictionary builder. 25 * 26 * @author markdavis 27 * @param <T> 28 * the return type for the dictionary 29 */ 30 public class StateDictionaryBuilder<T> implements DictionaryBuilder<T> { 31 32 private static final boolean SHOW_SIZE = true; 33 34 // only used while building 35 private Row buildingCurrentAddRow; 36 private int builtTotalBytes; 37 private int builtTotalStrings; 38 39 // results of build 40 private ArrayList<Row> builtRows; 41 private Row builtBaseRow; 42 private IntMap<T> builtResults; 43 private int builtMaxByteLength; 44 45 private StringByteConverter byteConverter = new CompactStringByteConverter(true); // new StringUtf8Converter(); // 46 // new ByteString(false); // new 47 // ByteString(true); // 48 49 private IntMapFactory<T> intMapFactory = new BasicIntMapFactory<>(); 50 51 /** 52 * Get/set the IntMapFactory used to store the values for T. The default is BasicIntMapFactory. 53 * 54 * @return 55 */ getIntMapFactory()56 public IntMapFactory<T> getIntMapFactory() { 57 return intMapFactory; 58 } 59 setIntMapFactory(IntMapFactory<T> intMapFactory)60 public StateDictionaryBuilder<T> setIntMapFactory(IntMapFactory<T> intMapFactory) { 61 this.intMapFactory = intMapFactory; 62 return this; 63 } 64 65 /** 66 * Get/Set the StringByteConverter used to convert strings to bytes and back. The default is a compacted form: 67 * ByteString(true). 68 * 69 * @return 70 */ getByteConverter()71 public StringByteConverter getByteConverter() { 72 return byteConverter; 73 } 74 setByteConverter(StringByteConverter byteConverter)75 public StateDictionaryBuilder<T> setByteConverter(StringByteConverter byteConverter) { 76 this.byteConverter = byteConverter; 77 return this; 78 } 79 80 /** 81 * Create a new simple StateDictionary. This format is relatively fast to 82 * produce, and has a fair amount of compaction. The Map must be sorted 83 * according to Dictionary.CHAR_SEQUENCE_COMPARATOR. It must not contain the key "". 84 * 85 * @param source 86 * @return 87 */ 88 @Override make(Map<CharSequence, T> source)89 public StateDictionary<T> make(Map<CharSequence, T> source) { 90 // clear out state 91 buildingCurrentAddRow = null; 92 builtTotalBytes = builtTotalStrings = builtMaxByteLength = 0; 93 builtRows = new ArrayList<>(); 94 builtBaseRow = makeRow(); 95 builtResults = intMapFactory.make(source.values()); 96 if (SHOW_SIZE) System.out.println("***VALUE STORAGE: " + builtResults.approximateStorage()); 97 98 Map<T, Integer> valueToInt = builtResults.getValueMap(); 99 Map<byte[], Integer> sorted = new TreeMap<>(SHORTER_BYTE_ARRAY_COMPARATOR); 100 for (CharSequence text : source.keySet()) { 101 sorted.put(byteConverter.toBytes(text), valueToInt.get(source.get(text))); 102 } 103 104 for (byte[] key : sorted.keySet()) { 105 addMapping(key, sorted.get(key)); 106 } 107 108 // now compact the rows 109 // first find out which rows are equivalent (recursively) 110 Map<Row, Row> replacements = new HashMap<>(); 111 { 112 Map<Row, Row> equivalents = new TreeMap<>(StateDictionary.rowComparator); 113 for (Row row : builtRows) { 114 Row cardinal = equivalents.get(row); 115 if (cardinal == null) { 116 equivalents.put(row, row); 117 } else { 118 replacements.put(row, cardinal); 119 } 120 } 121 } 122 if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size() + "\t REPLACEMENTS: " + replacements.size()); 123 124 // now replace all references to rows by their equivalents 125 for (Row row : builtRows) { 126 for (Byte key : row.byteToCell.keySet()) { 127 Cell cell = row.byteToCell.get(key); 128 Row newRow = replacements.get(cell.nextRow); 129 if (newRow != null) { 130 cell.nextRow = newRow; 131 } 132 } 133 } 134 // now compact the rows array 135 ArrayList<Row> newRows = new ArrayList<>(); 136 for (Row row : builtRows) { 137 if (!replacements.containsKey(row)) { 138 newRows.add(row); 139 } 140 } 141 // and set the Uniqueness values 142 setUniqueValues(builtBaseRow); 143 builtRows = newRows; 144 if (SHOW_SIZE) System.out.println("***ROWS: " + builtRows.size()); 145 return new StateDictionary<>(builtBaseRow, builtRows, builtResults, builtMaxByteLength, byteConverter); 146 } 147 makeRow()148 private Row makeRow() { 149 Row row = new Row(builtRows.size()); 150 builtRows.add(row); 151 return row; 152 } 153 addMapping(byte[] key, int result)154 private void addMapping(byte[] key, int result) { 155 buildingCurrentAddRow = builtBaseRow; 156 157 int lastIndex = key.length - 1; 158 for (int i = 0; i <= lastIndex; ++i) { 159 result = add(key[i], result, i == lastIndex); 160 } 161 builtTotalBytes += key.length; 162 builtTotalStrings += 1; 163 if (builtMaxByteLength < key.length) { 164 builtMaxByteLength = key.length; 165 } 166 } 167 add(byte key, int result, boolean last)168 private int add(byte key, int result, boolean last) { 169 Cell matchingCell = buildingCurrentAddRow.byteToCell.get(key); 170 if (matchingCell != null) { 171 if (matchingCell.nextRow == null && !last) { 172 matchingCell.nextRow = makeRow(); 173 // we add a continuation, so decrement 174 --buildingCurrentAddRow.terminatingReturnCount; 175 } 176 buildingCurrentAddRow = matchingCell.nextRow; 177 return result - matchingCell.deltaResult; 178 } 179 Cell cell = new Cell(); 180 buildingCurrentAddRow.byteToCell.put(key, cell); 181 cell.deltaResult = result; 182 if (last) { 183 cell.returns = true; 184 ++buildingCurrentAddRow.returnCount; 185 ++buildingCurrentAddRow.terminatingReturnCount; 186 } else { 187 cell.nextRow = buildingCurrentAddRow = makeRow(); 188 } 189 // when we create a new cell for the first time, we deposit the 190 // result, so we can clear it now 191 return 0; 192 } 193 194 /** 195 * Follow all the path values, and determine whether all possible results from 196 * a row (when following bytes) are the same. If so, set the flag on the row. 197 * The return value 198 * 199 * @param savedMatchValue 200 * TODO 201 * @return true if unique 202 */ 203 // NOTE: The way the table is built, we are guaranteed that the current value 204 // is "correct" (with no deltas needed) for at least one of the paths 205 // resulting from the row, EXCEPT in the very first row. Thus except for the first row, 206 // every row will have at least one cell with a zero deltaResult. That in turn means that 207 // if there is a deltaResult on a cell, at least one chain resulting from taking that cell 208 // will have that result. 209 // So, if there are any deltaResults in a row, it is not unique. 210 // Otherwise, if any of the cells have non-unique rows, it is not unique 211 // Otherwise it is unique. 212 // So this traverses the rows, setting the values for anything it hasn't seen. 213 // TODO: could be optimized (maybe) by saving the rows seen so far. On the other hand, 214 // this might not be worth the time in adding them to a save list. setUniqueValues(Row currentRow)215 private boolean setUniqueValues(Row currentRow) { 216 if (currentRow.hasUniqueValue == Uniqueness.UNIQUE) { 217 return true; 218 } 219 // see if there is any reason to make us not Unique 220 // According to the structure above: 221 // It will be that we have a child that is not unique, 222 // or that we have a nonzero delta value 223 // We don't have to look at anything else. 224 // We DO have to call setUniqueValues on each of our children, to make sure they are set. 225 for (Cell cell : currentRow.byteToCell.values()) { 226 if (cell.nextRow != null && !setUniqueValues(cell.nextRow)) { 227 currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS; 228 } else if (cell.deltaResult != 0) { 229 currentRow.hasUniqueValue = Uniqueness.AMBIGUOUS; 230 } 231 } 232 if (currentRow.hasUniqueValue == Uniqueness.AMBIGUOUS) { 233 return false; 234 } 235 currentRow.hasUniqueValue = Uniqueness.UNIQUE; 236 return true; 237 } 238 239 static final Comparator<byte[]> SHORTER_BYTE_ARRAY_COMPARATOR = new Comparator<byte[]>() { 240 241 @Override 242 public int compare(byte[] o1, byte[] o2) { 243 int minLen = o1.length; 244 if (minLen > o2.length) { 245 minLen = o2.length; 246 } 247 for (int i = 0; i < minLen; ++i) { 248 if (o1[i] != o2[i]) { 249 return o1[i] < o2[i] ? -1 : 1; // return lesser first 250 } 251 } 252 return o1.length < o2.length ? -1 : o1.length > o2.length ? 1 : 0; 253 } 254 255 }; 256 }