/*
 **********************************************************************
 * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
 **********************************************************************
 * Author: Mark Davis
 **********************************************************************
 */
package org.unicode.cldr.util;

import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;

import org.unicode.cldr.util.CharUtilities.CharSourceWrapper;
import org.unicode.cldr.util.Dictionary.Matcher;
import org.unicode.cldr.util.Dictionary.Matcher.Filter;
import org.unicode.cldr.util.Dictionary.Matcher.Status;
import org.unicode.cldr.util.SimpleDictionary.SimpleDictionaryBuilder;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.DateFormat;
import com.ibm.icu.text.SimpleDateFormat;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;
import com.ibm.icu.util.TimeZone;
import com.ibm.icu.util.ULocale;

/**
 * Manual test driver that builds the same mapping with a {@link StateDictionaryBuilder} and a
 * {@link SimpleDictionaryBuilder} and checks that the two resulting dictionaries agree.
 *
 * <p>Most checks are plain Java {@code assert} statements, so they are only evaluated when the
 * JVM runs with assertions enabled ({@code -ea}). Progress and diagnostics go to
 * {@code System.out}.
 *
 * <p>Should be in the package usertest, but it's a pain to rename files in CVS.
 *
 * @author markdavis
 *
 * @param <T> the type of the values stored in the dictionaries under test
 */
public class TestStateDictionaryBuilder<T> {
    /** When true, {@link #testWithUnicodeNames()} only adds the first few hundred names. */
    private static final boolean SHORT_TEST = true;

    /** When true, the result of {@code debugShow()} is printed after each rebuild. */
    private static final boolean SHOW_CONTENTS = true;

    /** When true, time zone names are mapped to the constant "t" instead of the zone ID. */
    private static final boolean CHECK_BOOLEAN = false;

    /** When true, the state dictionary itself plus size/row counts are printed. */
    private final boolean SHOW_STATES = true;

    boolean SIMPLE_ONLY = false;

    boolean TEST_AGAINST_SIMPLE = true;

    Dictionary<T> stateDictionary;
    Dictionary.Matcher<T> stateMatcher;

    Dictionary<T> simpleDictionary;
    Dictionary.Matcher<T> simpleMatcher;

    /** Pending entries; consumed (and cleared) each time the dictionaries are rebuilt. */
    Map<CharSequence, T> baseMapping = new TreeMap<>();

    final StateDictionaryBuilder<T> stateDictionaryBuilder = new StateDictionaryBuilder<>();
    final SimpleDictionaryBuilder<T> simpleDictionaryBuilder = new SimpleDictionaryBuilder<>();

    // TODO: convert to TestFramework
    /**
     * Runs {@link #test(String[])} on a {@code String}-valued instance and always prints
     * "DONE" afterwards, even when the test throws.
     *
     * @param args forwarded unchanged to {@link #test(String[])}
     */
    public static void main(String[] args) {
        try {
            new TestStateDictionaryBuilder<String>().test(args);
        } finally {
            System.out.println("DONE");
        }
    }

    /**
     * Runs the whole scenario: a few GMT strings, some English words, a word containing a
     * non-Latin letter, overlapping words ("fish", "fisher", "her") and finally the Hindi
     * generic time zone names of all available zones.
     *
     * @param args each of "utf8", "normal" or "compact" (case-insensitive) selects the byte
     *            converter installed on the state dictionary builder; other values are ignored
     */
    @SuppressWarnings({ "unchecked" })
    public void test(String[] args) {
        for (String arg : args) {
            if (arg.equalsIgnoreCase("utf8")) {
                stateDictionaryBuilder.setByteConverter(new Utf8StringByteConverter());
            } else if (arg.equalsIgnoreCase("normal")) {
                stateDictionaryBuilder.setByteConverter(new CompactStringByteConverter(false));
            } else if (arg.equalsIgnoreCase("compact")) {
                stateDictionaryBuilder.setByteConverter(new CompactStringByteConverter(true));
            }
        }

        baseMapping.put("GMT+0000", (T) ("t"));
        baseMapping.put("GMT+0100", (T) ("t"));
        baseMapping.put("GMT+0200", (T) ("t"));
        baseMapping.put("GMT+0300", (T) ("t"));
        baseMapping.put("GMT+0307", (T) ("t"));
        showDictionaryContents();

        addToBoth("man", 1);
        addToBoth("manner", 100);
        addToBoth("many", 10);
        addToBoth("any", 83);
        showDictionaryContents();

        baseMapping.put("man", (T) "Woman");
        baseMapping.put("many", (T) "Few");
        baseMapping.put("any", (T) "All");
        showDictionaryContents();

        for (Filter filter : Filter.values()) {
            final String string = "many manners ma";
            tryFind(string, new CharSourceWrapper<CharSequence>(string), stateDictionary, filter);
        }

        showWords("ma");
        showWords("ma!");
        showWords("!ma");
        showWords("man");
        showWords("man!");
        showWords("mann");
        showWords("mann!");
        showWords("many");
        showWords("many!");
        compare();

        addToBoth("m\u03B1nner", 1000);
        showDictionaryContents();
        showWords("m\u03B1");
        compare();

        addToBoth("fish", 10);
        showDictionaryContents();
        showWords("a fisherman");
        compare();

        addToBoth("fisher", 13);
        showDictionaryContents();
        showWords("a fisherman");
        compare();

        addToBoth("her", 55);
        showDictionaryContents();
        showWords("a fisherman");
        compare();

        // check some non-latin
        String[] zoneIDs = TimeZone.getAvailableIDs();
        SimpleDateFormat dt = (SimpleDateFormat) DateFormat.getDateInstance(DateFormat.LONG, new ULocale("hi"));
        dt.applyPattern("vvvv");
        for (String zoneID : zoneIDs) {
            TimeZone zone = TimeZone.getTimeZone(zoneID);
            dt.setTimeZone(zone);
            String zoneName = dt.format(0);
            addToBoth(zoneName, (T) (CHECK_BOOLEAN ? "t" : zoneID));
        }
        compare();
        showDictionaryContents();
        ((StateDictionary<T>) stateDictionary).flatten();

        if (SIMPLE_ONLY) {
            testWithUnicodeNames();

            ((StateDictionary<T>) stateDictionary).flatten();
            compare();
            System.out.println();
            showDictionaryContents();
        }
    }

    /**
     * Prints every result that {@link Matcher#find(Filter)} reports for the given text, with
     * the matched span marked by {@code [[...]]}.
     *
     * @param originalText the text as supplied by the caller; only used in the printed header
     * @param charListText the source actually handed to the matcher
     * @param dictionary the dictionary to search with
     * @param filter the filter passed to each {@code find} call
     * @param <U> the dictionary's value type
     */
    static public <U> void tryFind(CharSequence originalText, CharSource charListText, Dictionary<U> dictionary,
        Filter filter) {
        System.out.println("Using dictionary: "
            + Dictionary.load(dictionary.getMapping(), new TreeMap<CharSequence, U>()));
        System.out.println("Searching in: {" + originalText + "} with filter=" + filter);
        // Dictionaries are immutable, so we create a Matcher to search/test text.
        Matcher<U> matcher = dictionary.getMatcher();
        matcher.setText(charListText);
        for (Status status = matcher.find(filter); status != Status.NONE; status = matcher.find(filter)) {
            String unique = ""; // only set for partial matches
            if (status == Status.PARTIAL) {
                // nextUniquePartial() also sets the match value to the "first" partial match
                unique = matcher.nextUniquePartial() ? "\tUnique" : "\tNot Unique";
            }
            // Show results
            System.out.println("{"
                + showBoth(charListText, 0, matcher.getOffset()) + "[["
                + showBoth(charListText, matcher.getOffset(), matcher.getMatchEnd())
                + "]]" + showBoth(charListText, matcher.getMatchEnd(), charListText.getKnownLength())
                + "}\t" + status + "  \t{" + matcher.getMatchValue() + "}\t" + unique);
        }
        System.out.println();
    }

    /**
     * Returns the text between two offsets of a source for display.
     *
     * @param source the source to read from
     * @param start start offset
     * @param end end offset
     * @return {@code sourceSubSequence(start, end)} when {@code source} is a
     *         {@link CharSourceWrapper}, otherwise {@code source.subSequence(start, end)}
     */
    static public CharSequence showBoth(CharSource source, int start, int end) {
        if (source instanceof CharSourceWrapper) {
            CharSourceWrapper<?> wrapper = (CharSourceWrapper<?>) source;
            return wrapper.sourceSubSequence(start, end);
        }
        return source.subSequence(start, end);
    }

    /**
     * Rebuilds both dictionaries and their matchers from {@link #baseMapping}, then empties
     * the mapping. Keeping the matchers in step with the dictionaries matters because
     * {@link #compare()} validates the dictionaries' data through the matcher fields.
     */
    private void rebuildDictionaries() {
        simpleDictionary = simpleDictionaryBuilder.make(baseMapping);
        simpleMatcher = simpleDictionary.getMatcher();
        stateDictionary = stateDictionaryBuilder.make(baseMapping);
        stateMatcher = stateDictionary.getMatcher();
        baseMapping.clear();
    }

    /**
     * Rebuilds the dictionaries from the pending mapping and prints the state dictionary's
     * contents (plus its states and structure when the corresponding flags are set).
     */
    private void showDictionaryContents() {
        // build stuff to use from now on
        rebuildDictionaries();

        System.out.println("Dictionary: "
            + Dictionary.load(stateDictionary.getMapping(), new TreeMap<CharSequence, T>()));
        System.out.println();
        if (SHOW_STATES) {
            System.out.println("States:" + CldrUtility.LINE_SEPARATOR + stateDictionary);
            System.out.println();
        }
        if (SHOW_CONTENTS) {
            System.out.println("Structure:" + CldrUtility.LINE_SEPARATOR + stateDictionary.debugShow());
            System.out.println();
        }
    }

    /**
     * Builds dictionaries that map Unicode character names to the hex form of their code
     * point, then compares the two implementations. With {@link #SHORT_TEST} set, only the
     * first 501 names (in sorted order) are added.
     */
    @SuppressWarnings("unchecked")
    private void testWithUnicodeNames() {
        UnicodeSet testSet = new UnicodeSet(
            "[[:assigned:] - [:ideographic:] - [:Co:] - [:Cs:]]"); // & [\\u0000-\\u0FFF]
        int count = 0;
        Map<String, T> data = new TreeMap<>();
        for (UnicodeSetIterator it = new UnicodeSetIterator(testSet); it.next();) {
            String name = UCharacter.getExtendedName(it.codepoint);
            if (name == null) {
                continue;
            }
            if ((++count & 0xFF) == 0) {
                System.out.println(count + ":\t"
                    + com.ibm.icu.impl.Utility.hex(it.codepoint) + "\t" + name);
            }
            data.put(name, (T) com.ibm.icu.impl.Utility.hex(it.codepoint, 4));
        }
        count = 0;
        for (Entry<String, T> entry : data.entrySet()) {
            if (SHORT_TEST && count++ > 500) {
                break; // nothing further would be added, so stop walking the map
            }
            addToBoth(entry.getKey(), entry.getValue());
        }
        // Refresh the matchers together with the dictionaries; compare() reads both.
        rebuildDictionaries();
        compare();
    }

    /**
     * Asserts that the state and simple dictionaries hold the same mapping, that every key
     * matches itself in each matcher, and that both matchers behave identically on each key,
     * on "!" + key and on key + "!".
     */
    private void compare() {
        System.out.println("Comparing results: ");

        Map<CharSequence, T> dictionaryData = Dictionary.load(stateDictionary.getMapping(),
            new HashMap<CharSequence, T>());
        Map<CharSequence, T> simpleDictionaryData = Dictionary.load(simpleDictionary.getMapping(),
            new HashMap<CharSequence, T>());

        assert dictionaryData.equals(simpleDictionaryData) : showDifference(dictionaryData, simpleDictionaryData);
        if (SHOW_STATES) {
            System.out.println("Size: " + dictionaryData.size());
            System.out.println("Rows: "
                + ((StateDictionary<T>) stateDictionary).getRowCount());
        }

        System.out.println("Checking values: state dictionary");
        checkSimpleMatches(stateMatcher, dictionaryData);
        System.out.println("Checking values: simple dictionary");
        checkSimpleMatches(simpleMatcher, simpleDictionaryData);
        int count = 0;
        System.out.println("Cross-checking all values");
        for (CharSequence myText : simpleDictionaryData.keySet()) {
            if ((++count & 0xFF) == 0xFF) {
                System.out.println(count + ":\t" + myText);
            }
            crossCheck(new CharSourceWrapper<>(myText));
            crossCheck("!" + myText);
            crossCheck(myText + "!");
        }
    }

    /**
     * Walks both maps in their iteration order, printing each pair of entries, and stops at
     * the first pair that differs. Used only as the message of a failed assertion.
     *
     * @param dictionaryData data loaded from the state dictionary
     * @param simpleDictionaryData data loaded from the simple dictionary
     * @return {@code "<entry1>!=<entry2>"} for the first differing pair, or "no difference"
     */
    private String showDifference(Map<CharSequence, T> dictionaryData, Map<CharSequence, T> simpleDictionaryData) {
        System.out.println(dictionaryData.size() + ", " + simpleDictionaryData.size());
        Iterator<Entry<CharSequence, T>> it1 = dictionaryData.entrySet().iterator();
        Iterator<Entry<CharSequence, T>> it2 = simpleDictionaryData.entrySet().iterator();
        while (it1.hasNext() || it2.hasNext()) {
            Entry<CharSequence, T> item1 = it1.hasNext() ? it1.next() : null;
            Entry<CharSequence, T> item2 = it2.hasNext() ? it2.next() : null;
            System.out.println(item1 + ", " + item2);
            if (item1 == null || item2 == null || !item1.equals(item2)) {
                return item1 + "!=" + item2;
            }
        }
        return "no difference";
    }

    /**
     * Convenience overload: wraps the text in a {@link CharSourceWrapper} and cross-checks it.
     *
     * @param myText the text to run through both matchers
     */
    private void crossCheck(CharSequence myText) {
        crossCheck(new CharSourceWrapper<>(myText));
    }

    /**
     * Runs both matchers over the text from every offset and asserts that they report the
     * same status, match end, partial-uniqueness and match value at every step.
     *
     * @param myText the text to run through both matchers
     */
    private void crossCheck(CharSource myText) {
        stateMatcher.setText(myText); // set the text to operate on
        simpleMatcher.setText(myText); // set the text to operate on
        for (int i = 0; stateMatcher.getText().hasCharAt(i); ++i) {
            stateMatcher.setOffset(i);
            simpleMatcher.setOffset(i);
            while (true) {
                Status stateStatus = stateMatcher.next();
                Status simpleStatus = simpleMatcher.next();
                assert stateStatus == simpleStatus : showValues(stateStatus, simpleStatus);
                final int stateEnd = stateMatcher.getMatchEnd();
                final int simpleEnd = simpleMatcher.getMatchEnd();
                assert stateEnd == simpleEnd : showValues(stateStatus, simpleStatus);
                if (stateStatus == Status.PARTIAL) {
                    boolean stateUnique = stateMatcher.nextUniquePartial();
                    boolean simpleUnique = simpleMatcher.nextUniquePartial();
                    assert stateUnique == simpleUnique : showValues(stateStatus, simpleStatus);
                }
                // test this after checking PARTIAL; compare by value, not by reference
                assert Objects.equals(stateMatcher.getMatchValue(), simpleMatcher.getMatchValue())
                    : showValues(stateStatus, simpleStatus);
                if (stateStatus != Status.MATCH) {
                    break;
                }
            }
        }
    }

    /**
     * Formats the current text and the state of both matchers for an assertion message.
     *
     * @param stateStatus last status returned by the state matcher
     * @param simpleStatus last status returned by the simple matcher
     * @return a multi-line description
     */
    private String showValues(Status stateStatus, Status simpleStatus) {
        return CldrUtility.LINE_SEPARATOR + "TEXT:\t" + stateMatcher.text + CldrUtility.LINE_SEPARATOR + "STATE:\t"
            + showValues(stateStatus, stateMatcher) + CldrUtility.LINE_SEPARATOR + "SIMPLE:\t"
            + showValues(simpleStatus, simpleMatcher);
    }

    /**
     * Formats one matcher's offsets, status, matched text and value.
     *
     * @param status the status last returned by {@code matcher}
     * @param matcher the matcher to describe
     * @return a single-line description, ending in "UNIQUE" for a unique partial match
     */
    private String showValues(Status status, Matcher<T> matcher) {
        // nextUniquePartial() sets matchValue for PARTIAL, so call it before reading the value
        boolean uniquePartial = status == Status.PARTIAL && matcher.nextUniquePartial();
        return String.format("\tOffsets: %s,%s\tStatus: %s\tString: \"%s\"\tValue: %s  %s",
            matcher.getOffset(),
            matcher.getMatchEnd(),
            status,
            matcher.getMatchText(),
            matcher.getMatchValue(),
            uniquePartial ? "\tUNIQUE" : "");
    }

    /**
     * Check that the words all match against themselves.
     *
     * @param matcher the matcher to exercise
     * @param data the expected key/value pairs; each key is matched from offset 0 and the
     *            last MATCH must end at the key's length and carry the key's value
     */
    private void checkSimpleMatches(Matcher<T> matcher, Map<CharSequence, T> data) {
        int count = 0;
        for (CharSequence myText : data.keySet()) {
            if ((count++ & 0xFF) == 0xFF) {
                System.out.println(count + ":\t" + myText);
            }
            matcher.setText(myText); // set the text to operate on

            matcher.setOffset(0);
            int matchEnd = -1;
            T matchValue = null;
            // find the longest match: keep the last MATCH reported from this offset
            while (matcher.next() == Status.MATCH) {
                matchEnd = matcher.getMatchEnd();
                matchValue = matcher.getMatchValue();
            }
            assert matchEnd == myText.length() : "failed to find end of <" + myText + "> got instead " + matchEnd;
            // compare by value, not by reference
            assert Objects.equals(matchValue, data.get(myText))
                : "wrong value for <" + myText + "> got instead " + matchValue;
        }
    }

    /**
     * Queues {@code string} with the value {@code i + "/" + string}.
     *
     * @param string the key to add
     * @param i number used as the prefix of the generated value
     */
    @SuppressWarnings("unchecked")
    private void addToBoth(CharSequence string, int i) {
        baseMapping.put(string, (T) (i + "/" + string));
    }

    /**
     * Queues a key/value pair in {@link #baseMapping}; it reaches both dictionaries on the
     * next rebuild.
     *
     * @param string the key to add
     * @param i the value to store for the key
     */
    private void addToBoth(CharSequence string, T i) {
        baseMapping.put(string, i);
    }

    /**
     * Prints what the matchers find in the text. Unless {@link #SIMPLE_ONLY} is set, both
     * matchers are run and, if their results differ, both runs are repeated and the set
     * differences printed.
     *
     * @param myText the text to search
     */
    private void showWords(String myText) {
        System.out.format("Finding words in: \"%s\"" + CldrUtility.LINE_SEPARATOR, myText);
        if (SIMPLE_ONLY) {
            showWords("", simpleMatcher, myText);
            return;
        }
        Set<String> simpleResult = showWords("Simple", simpleMatcher, myText);
        Set<String> stateResult = showWords("STATE", stateMatcher, myText);
        if (!simpleResult.equals(stateResult)) {
            // repeat, for debugging
            System.out.println("  DIFFERENCE");
            showWords("Simple", simpleMatcher, myText);
            showWords("STATE", stateMatcher, myText);
            Set<String> simpleMinusState = new LinkedHashSet<>(simpleResult);
            simpleMinusState.removeAll(stateResult);
            System.out.println("Simple-State" + simpleMinusState);
            Set<String> stateMinusSimple = new LinkedHashSet<>(stateResult);
            stateMinusSimple.removeAll(simpleResult);
            System.out.println("State-Simple" + stateMinusSimple);
        }
    }

    /**
     * Walks through the text offset by offset and records what the matcher reports at each
     * one. Results other than NONE are also printed.
     *
     * @param title label printed in front of each result; empty for no label
     * @param matcher the matcher to run
     * @param myText the text to search
     * @return the formatted results (including NONE results) in encounter order
     */
    private Set<String> showWords(String title, Matcher<T> matcher, String myText) {
        final String prefix = title.isEmpty() ? "" : "\tType: " + title;
        Set<String> result = new LinkedHashSet<>();
        // Set the text to operate on
        matcher.setText(myText);
        for (int i = 0; matcher.hasCharAt(i); ++i) {
            matcher.setOffset(i);
            Status status;
            // We might get multiple matches at each point, so walk through all of
            // them. The last one might be a partial, so collect some extra
            // information in that case.
            do {
                // Sets matchValue if there is a MATCH
                status = matcher.next();
                // nextUniquePartial() sets matchValue if the next() status was PARTIAL
                boolean uniquePartial = status == Status.PARTIAL && matcher.nextUniquePartial();
                // Format all of the information
                String info = String.format(
                    "\tOffsets: %s,%s\tStatus: %s\tString: \"%s\"\tValue: %s%s",
                    matcher.getOffset(), matcher.getMatchEnd(), status,
                    matcher.getMatchText(), matcher.getMatchValue(),
                    uniquePartial ? "\tUNIQUE" : "");
                result.add(info);
                if (status != Status.NONE) {
                    // If there was a match or partial match, show what we got
                    System.out.println(prefix + info);
                }
            } while (status == Status.MATCH);
        }
        return result;
    }
}