package org.unicode.cldr.icu;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.tool.Option.Options;

import com.ibm.icu.impl.Row;
import com.ibm.icu.impl.Row.R2;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UForwardCharacterIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * Compares the contents of ICU data output while ignoring comments.
 *
 * @author markdavis, jchye
 *
 */
public class CompareIcuOutput {
    private static final boolean DEBUG = false;

    private static final Options options = new Options(
        "Usage: RBChecker [OPTIONS] DIR1 DIR2 FILE_REGEX\n" +
            "This program is used to compare the RB text files in two different directories.\n" +
            " Example: org.unicode.cldr.icu.RBChecker olddatadir newdatadir .*")
        .add("sort", 's', null, null, "Sort values for comparison");

    /** Orders value arrays by their first element only. */
    private static final Comparator<String[]> comparator = new Comparator<String[]>() {
        @Override
        public int compare(String[] arg0, String[] arg1) {
            return arg0[0].compareTo(arg1[0]);
        }
    };

    private static boolean shouldSort = false;

    /**
     * Command-line entry point.
     *
     * @param args
     *            DIR1 DIR2 FILE_REGEX, in that order
     * @throws IOException
     *             if a directory or file cannot be read
     * @throws IllegalArgumentException
     *             if fewer than three arguments are supplied
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 3) {
            throw new IllegalArgumentException("Expected arguments: DIR1 DIR2 FILE_REGEX");
        }
        String dir1 = args[0];
        String dir2 = args[1];
        String regex = args[2];
        System.out.println("dir1 " + dir1);
        System.out.println("dir2 " + dir2);
        System.out.println("regex " + regex);
        // NOTE(review): nothing in this file passes args to `options`, so confirm
        // that the sort flag can actually be set from the command line.
        shouldSort = options.get("sort").doesOccur();
        long totaltime = System.currentTimeMillis();
        System.out.println("Comparing the contents of text files...");
        compareTextFiles(dir1, dir2, regex);
        System.out.println("Total time taken: " + (System.currentTimeMillis() - totaltime));
    }

    /**
     * Parses and compares the ICU text files that exist in dir1 against the
     * files of the same name in dir2, printing a report to standard output.
     *
     * @param dir1
     *            directory that is listed to find candidate files (the "old" data)
     * @param dir2
     *            directory holding the files to compare against (the "new" data)
     * @param regex
     *            regular expression that the file name, minus ".txt", must match
     * @throws IOException
     *             if dir1 cannot be listed or a file cannot be read
     */
    private static void compareTextFiles(String dir1, String dir2, String regex) throws IOException {
        String[] filenames = new File(dir1).list();
        if (filenames == null) {
            // File.list() returns null when the path is not a directory or cannot be read.
            throw new FileNotFoundException(dir1 + " is not a readable directory.");
        }
        // Compile once instead of once per file name.
        Pattern filePattern = Pattern.compile(regex + "\\.txt");
        int same = 0, different = 0;
        for (String filename : filenames) {
            if (!filePattern.matcher(filename).matches()) {
                continue;
            }
            String locale = filename.substring(0, filename.length() - 4);
            try {
                IcuData oldData = loadDataFromTextfiles(dir1, locale);
                IcuData newData = loadDataFromTextfiles(dir2, locale);
                StringBuffer messages = new StringBuffer();
                if (analyseMatches(oldData, newData, messages)) {
                    System.out.println("=== Differences found for " + locale + " ===");
                    System.out.print(messages);
                    different++;
                } else {
                    same++;
                }
            } catch (FileNotFoundException e) {
                // A locale missing from one directory is reported and skipped, not fatal.
                System.err.println(locale + " file not found, skipping");
            }
        }
        System.out.println("Check finished with " + different + " different and " + same + " same locales.");
    }

    /**
     * Loads the file {@code icuPath/locale.txt} into a new IcuData.
     *
     * @param icuPath
     *            directory containing the text file
     * @param locale
     *            file name without the ".txt" extension
     * @return the parsed data
     * @throws FileNotFoundException
     *             if the file does not exist
     * @throws IOException
     *             if the file cannot be read
     */
    private static IcuData loadDataFromTextfiles(String icuPath, String locale) throws IOException {
        IcuData icuData = new IcuData(locale + ".xml", locale, true);
        String filename = icuPath + '/' + locale + ".txt";
        if (!new File(filename).exists()) {
            throw new FileNotFoundException(filename + " does not exist.");
        }
        // Comments are not compared, so no list is supplied to collect them.
        parseRB(filename, icuData, null);
        return icuData;
    }

    /**
     * Computes lists of all differences between two sets of IcuData and
     * appends a description of them to the buffer.
     *
     * @param oldData
     *            the reference data
     * @param newData
     *            the data being checked against the reference
     * @param buffer
     *            receives the textual report
     * @return true if any difference was found
     */
    private static boolean analyseMatches(IcuData oldData, IcuData newData, StringBuffer buffer) {
        boolean hasDifferences = false;
        Set<String> missing = new TreeSet<String>(oldData.keySet());
        missing.removeAll(newData.keySet());
        if (missing.size() > 0) {
            buffer.append("Missing paths:\n");
            printAllInSet(oldData, missing, buffer);
            hasDifferences = true;
        }
        Set<String> extra = new TreeSet<String>(newData.keySet());
        extra.removeAll(oldData.keySet());
        if (extra.size() > 0) {
            buffer.append("Extra paths:\n");
            printAllInSet(newData, extra, buffer);
            hasDifferences = true;
        }
        Set<String> common = new TreeSet<String>(oldData.keySet());
        common.retainAll(newData.keySet());
        for (String rbPath : common) {
            if (rbPath.startsWith("/Version")) continue; // skip version
            List<String[]> oldValues = oldData.get(rbPath);
            List<String[]> newValues = newData.get(rbPath);
            if (shouldSort) {
                // Sorts the lists returned by IcuData.get() in place.
                Collections.sort(oldValues, comparator);
                Collections.sort(newValues, comparator);
            }
            // Print out any value differences.
            if (valuesDiffer(oldValues, newValues)) {
                buffer.append(rbPath + " contains differences:\n");
                buffer.append("\tOld: ");
                printValues(oldValues, buffer);
                buffer.append("\tNew: ");
                printValues(newValues, buffer);
                hasDifferences = true;
            }
        }
        return hasDifferences;
    }

    /**
     * Appends one "path = values" line to the buffer for every given path.
     *
     * @param icuData
     *            source of the values
     * @param paths
     *            paths to print
     * @param buffer
     *            receives the output
     */
    private static void printAllInSet(IcuData icuData, Set<String> paths, StringBuffer buffer) {
        for (String path : paths) {
            buffer.append("\t" + path + " = ");
            printValues(icuData.get(path), buffer);
        }
    }

    /**
     * Appends the values to the buffer on a single line terminated by a newline.
     *
     * @param values
     *            the value arrays to print
     * @param buffer
     *            receives the output
     */
    private static void printValues(List<String[]> values, StringBuffer buffer) {
        // Enclose both numbers and strings in quotes for simplicity.
        for (String[] array : values) {
            if (array.length == 1) {
                buffer.append('"').append(array[0]).append('"');
            } else {
                buffer.append("[");
                for (String value : array) {
                    buffer.append('"').append(value).append("\", ");
                }
                buffer.append("]");
            }
            buffer.append(", ");
        }
        buffer.append('\n');
    }

    /**
     * Compares two value lists element by element, disregarding space
     * characters inside the individual strings.
     *
     * @param oldValues
     *            the reference values
     * @param newValues
     *            the values being checked
     * @return true if the lists differ in size, in the length of any array, or
     *         in any string once spaces are removed; false otherwise
     */
    private static boolean valuesDiffer(List<String[]> oldValues, List<String[]> newValues) {
        if (oldValues.size() != newValues.size()) return true;
        for (int i = 0; i < oldValues.size(); i++) {
            String[] oldArray = oldValues.get(i);
            String[] newArray = newValues.get(i);
            if (oldArray.length != newArray.length) {
                return true;
            }
            for (int j = 0; j < oldArray.length; j++) {
                // Ignore space characters.
                if (!oldArray[j].replace(" ", "").equals(newArray[j].replace(" ", ""))) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Parse an ICU resource bundle into key,value items
     *
     * @param filename
     *            path of the file to parse; its name must end in ".txt"
     * @param icuData
     *            receives the parsed paths and values
     * @param comments
     *            receives every comment token and its text; may be null to
     *            discard comments
     * @throws IOException
     *             if the file cannot be opened or closed
     * @throws IllegalArgumentException
     *             if the file name does not end in ".txt" or the contents are
     *             malformed
     */
    static void parseRB(String filename, IcuData icuData, List<R2<MyTokenizer.Type, String>> comments)
        throws IOException {
        File file = new File(filename);
        String coreFile = file.getName();
        if (!coreFile.endsWith(".txt")) {
            throw new IllegalArgumentException("missing .txt in: " + filename);
        }
        // redo this later on to use fixed PatternTokenizer
        BufferedReader in = FileUtilities.openUTF8Reader("", filename);
        try {
            MyTokenizer tokenIterator = new MyTokenizer(in);
            StringBuffer tokenText = new StringBuffer();
            // Stack of enclosing paths, with a parallel stack of counters used to
            // name unlabelled child tables "<0>", "<1>", ...
            List<String> oldPaths = new ArrayList<String>();
            List<Integer> indices = new ArrayList<Integer>();
            String lastLabel = null;
            String path = "";
            /*
             * AuxExemplarCharacters{
             * "[á à ă â å ä ã ā æ ç é è ĕ ê ë ē í ì ĭ î ï ī ñ ó ò ŏ ô ö ø ō œ ú ù ŭ û ü ū ÿ"
             * "]" } ExemplarCharacters{
             * "[a b c d e f g h i j k l m n o p q r s t u v w x y z]"}
             * ExemplarCharactersCurrency
             * {"[a b c č d e f g h i j k l ł m n o º p q r s t u v w x y z]"}
             * ExemplarCharactersIndex
             * {"[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z]"}
             * ExemplarCharactersPunctuation{"[\- ‐ – — , ; \: ! ? . … ' ‘ ’ \"
             * “ ” ( ) \[ \] @ * / \& # † ‡ ′ ″ §]"}
             */
            // Last non-comment token seen; comments do not update it.
            MyTokenizer.Type lastToken = null;
            // Non-null only while inside a brace-delimited value array.
            List<String> arrayValues = null;
            while (true) {
                MyTokenizer.Type nextToken = tokenIterator.next(tokenText);
                if (DEBUG)
                    System.out.println(nextToken + "\t" + tokenText);
                switch (nextToken) {
                case BLOCK_COMMENT:
                case LINE_COMMENT:
                    if (comments != null) {
                        comments.add(Row.of(nextToken, tokenText.toString()));
                    }
                    continue;
                case DONE:
                    if (oldPaths.size() != 0) {
                        throw new IllegalArgumentException("missing }");
                    }
                    return;
                case ID:
                    lastLabel = lastLabel == null ? tokenText.toString() : lastLabel + " " + tokenText;
                    break;
                case QUOTED:
                    if (lastLabel == null) {
                        lastLabel = tokenText.toString();
                    } else {
                        // Remove consecutive quotes.
                        lastLabel += tokenText;
                    }
                    break;
                case OPEN_BRACE:
                    // Check for array-type values.
                    if (lastToken == MyTokenizer.Type.COMMA) {
                        arrayValues = new ArrayList<String>();
                    } else {
                        oldPaths.add(path);
                        indices.add(0);
                        if (lastToken == MyTokenizer.Type.OPEN_BRACE || lastToken == MyTokenizer.Type.CLOSE_BRACE) {
                            // Unlabelled table: name it by its position within the parent.
                            int currentIndexPos = indices.size() - 2;
                            int currentIndex = indices.get(currentIndexPos);
                            lastLabel = "<" + currentIndex + ">";
                            indices.set(currentIndexPos, currentIndex + 1);
                        } else if (lastLabel == null) {
                            // A brace with nothing before it has no label to form a path from.
                            throw new IllegalArgumentException(filename + ": Label missing!");
                        } else if (lastLabel.contains(":") && !lastLabel.contains(":int") && !lastLabel.contains(":alias")
                            || path.endsWith("/relative")) {
                            lastLabel = '"' + lastLabel + '"';
                        }
                        path += "/" + lastLabel;
                    }
                    lastLabel = null;
                    break;
                case CLOSE_BRACE:
                    if (lastLabel != null) {
                        addPath(path, lastLabel, icuData);
                        lastLabel = null;
                    }

                    if (arrayValues == null) {
                        path = oldPaths.remove(oldPaths.size() - 1);
                        indices.remove(indices.size() - 1);
                    } else {
                        // Value array closed, add it to the path.
                        String[] array = new String[0];
                        addPath(path, arrayValues.toArray(array), icuData);
                        arrayValues = null;
                    }
                    if (DEBUG)
                        System.out.println("POP:\t" + path);
                    break;
                case COMMA:
                    if (lastToken != MyTokenizer.Type.QUOTED && lastToken != MyTokenizer.Type.ID) {
                        throw new IllegalArgumentException(filename + ", " + path + ": Commas can only occur after values ");
                    } else if (lastLabel == null) {
                        throw new IllegalArgumentException(filename + ": Label missing!");
                    }
                    if (arrayValues != null) {
                        arrayValues.add(lastLabel);
                    } else {
                        addPath(path, lastLabel, icuData);
                    }
                    lastLabel = null;
                    break;
                default:
                    throw new IllegalArgumentException("Illegal type in " + filename + ": " + nextToken + "\t" + tokenText
                        + "\t" + Utility.hex(tokenText));
                }
                lastToken = nextToken;
            }
        } finally {
            // Close the reader on every exit path, including parse errors.
            in.close();
        }
    }

    /**
     * Adds a single value to the data under the given path.
     *
     * @param path
     *            full path, including the leading top-level segment
     * @param value
     *            the value to add
     * @param icuData
     *            receives the value
     */
    private static void addPath(String path, String value, IcuData icuData) {
        addPath(path, new String[] { value }, icuData);
    }

    /**
     * Adds an array of values to the data under the given path, after
     * dropping the path's first segment.
     *
     * @param path
     *            full path; must contain a second '/' after its first character
     * @param values
     *            the values to add
     * @param icuData
     *            receives the values
     */
    private static void addPath(String path, String[] values, IcuData icuData) {
        path = path.substring(path.indexOf('/', 1));
        icuData.add(path, values);
    }

    /**
     * Reads in tokens from an ICU data file reader.
     * Replace by updated PatternTokenizer someday
     *
     * @author markdavis
     *
     */
    static class MyTokenizer {
        enum Type {
            DONE, ID, QUOTED, OPEN_BRACE, CLOSE_BRACE, COMMA, LINE_COMMENT, BLOCK_COMMENT, BROKEN_QUOTE, BROKEN_BLOCK_COMMENT, UNKNOWN
        }

        private final UForwardCharacterIterator source;
        private final UnicodeSet spaceCharacters = new UnicodeSet("[\\u0000\\uFEFF[:pattern_whitespace:]]");
        private final UnicodeSet idCharacters = new UnicodeSet("[-+.():%\"'[:xid_continue:]]");
        private final UnicodeSet quoteCharacters = new UnicodeSet("[\"']");

        /** One code point of push-back, or a negative value when empty. */
        private int bufferedChar = -1;

        /**
         * @param reader
         *            supplies the characters to tokenize
         */
        public MyTokenizer(Reader reader) {
            this.source = new UReaderForwardCharacterIterator(reader);
        }

        /**
         * Reads the next token.
         *
         * @param tokenText
         *            cleared and then filled with the token's text (without the
         *            delimiters for comments and quoted strings); left
         *            untouched when DONE is returned
         * @return the type of the token read
         * @throws IllegalArgumentException
         *             if a '/' is found that does not start a comment
         */
        public Type next(StringBuffer tokenText) {
            int cp = getCodePoint();
            // Skip all spaces not in quotes.
            while (cp >= 0 && spaceCharacters.contains(cp)) {
                cp = getCodePoint();
            }

            if (cp == -1) {
                return Type.DONE;
            }
            tokenText.setLength(0);
            if (cp == '/') {
                cp = getCodePoint();
                if (cp == '/') { // line comment
                    while (true) {
                        cp = getCodePoint();
                        if (cp == '\n' || cp < 0) {
                            return Type.LINE_COMMENT;
                        }
                        tokenText.appendCodePoint(cp);
                    }
                } else if (cp == '*') { // block comment
                    while (true) {
                        cp = getCodePoint();
                        if (cp < 0) {
                            return Type.BROKEN_BLOCK_COMMENT;
                        }
                        // A '*' ends the comment only when a '/' follows it directly.
                        while (cp == '*') {
                            int cp2 = getCodePoint();
                            if (cp2 < 0) {
                                return Type.BROKEN_BLOCK_COMMENT;
                            } else if (cp2 == '/') {
                                return Type.BLOCK_COMMENT;
                            }
                            tokenText.appendCodePoint(cp);
                            cp = cp2;
                        }
                        tokenText.appendCodePoint(cp);
                    }
                } else {
                    throw new IllegalArgumentException("/ can only be in quotes or comments");
                }
            }
            if (quoteCharacters.contains(cp)) {
                // Return the text inside and *excluding* the quotes.
                int oldQuote = cp;
                cp = getCodePoint();
                while (cp != oldQuote) {
                    if (cp < 0) {
                        return Type.BROKEN_QUOTE;
                    } else if (cp == '\\') {
                        // Keep the backslash and take the following character
                        // verbatim, so an escaped quote does not end the string.
                        tokenText.appendCodePoint(cp);
                        cp = getCodePoint();
                        if (cp < 0) {
                            return Type.BROKEN_QUOTE;
                        }
                    }
                    tokenText.appendCodePoint(cp);
                    cp = getCodePoint();
                }
                return Type.QUOTED;
            }
            if (cp == '{') {
                return Type.OPEN_BRACE;
            }
            if (cp == '}') {
                return Type.CLOSE_BRACE;
            }
            if (cp == ',') {
                return Type.COMMA;
            }
            if (idCharacters.contains(cp)) {
                while (true) {
                    tokenText.appendCodePoint(cp);
                    cp = getCodePoint();
                    if (cp < 0 || !idCharacters.contains(cp)) {
                        // The terminating character belongs to the next token.
                        pushCodePoint(cp);
                        return Type.ID;
                    }
                }
            }
            tokenText.appendCodePoint(cp);
            return Type.UNKNOWN;
        }

        /**
         * @return the pushed-back code point if there is one, otherwise the
         *         next code point from the source
         */
        int getCodePoint() {
            if (bufferedChar >= 0) {
                int result = bufferedChar;
                bufferedChar = -1;
                return result;
            }
            return source.nextCodePoint();
        }

        /**
         * Pushes a code point back so that the next getCodePoint() returns it.
         *
         * @param codepoint
         *            the code point to push back; a negative value leaves the
         *            push-back slot empty
         * @throws IllegalArgumentException
         *             if a code point is already pushed back
         */
        void pushCodePoint(int codepoint) {
            if (bufferedChar >= 0) {
                throw new IllegalArgumentException("Cannot push twice");
            }
            bufferedChar = codepoint;
        }
    }

    /**
     * Adapts a Reader to the UForwardCharacterIterator interface.
     */
    public static class UReaderForwardCharacterIterator implements UForwardCharacterIterator {
        private final Reader reader;
        /** One UTF-16 code unit of look-ahead, or a negative value when empty. */
        private int bufferedChar = -1;

        /**
         * @param reader
         *            supplies the UTF-16 code units
         */
        public UReaderForwardCharacterIterator(Reader reader) {
            this.reader = reader;
        }

        /*
         * (non-Javadoc)
         *
         * @see com.ibm.icu.text.UForwardCharacterIterator#next()
         */
        @Override
        public int next() {
            if (bufferedChar >= 0) {
                int temp = bufferedChar;
                bufferedChar = -1;
                return temp;
            }
            try {
                return reader.read();
            } catch (IOException e) {
                throw new IllegalArgumentException(e);
            }
        }

        /*
         * (non-Javadoc)
         *
         * @see com.ibm.icu.text.UForwardCharacterIterator#nextCodePoint()
         */
        @Override
        public int nextCodePoint() {
            int ch1 = next();
            if (ch1 >= 0 && UTF16.isLeadSurrogate((char) ch1)) {
                int ch2 = next();
                if (ch2 >= 0 && UTF16.isTrailSurrogate((char) ch2)) {
                    return UCharacter.getCodePoint((char) ch1, (char) ch2);
                }
                // Unpaired lead surrogate: keep the unit just read so that the
                // next call returns it instead of losing it.
                bufferedChar = ch2;
            }
            return ch1;
        }
    }
}