• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.icu;
2 
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7 import java.io.Reader;
8 import java.util.ArrayList;
9 import java.util.Collections;
10 import java.util.Comparator;
11 import java.util.List;
12 import java.util.Set;
13 import java.util.TreeSet;
14 
15 import org.unicode.cldr.draft.FileUtilities;
16 import org.unicode.cldr.tool.Option.Options;
17 
18 import com.ibm.icu.impl.Row;
19 import com.ibm.icu.impl.Row.R2;
20 import com.ibm.icu.impl.Utility;
21 import com.ibm.icu.lang.UCharacter;
22 import com.ibm.icu.text.UForwardCharacterIterator;
23 import com.ibm.icu.text.UTF16;
24 import com.ibm.icu.text.UnicodeSet;
25 
26 /**
27  * Compares the contents of ICU data output while ignoring comments.
28  *
29  * @author markdavis, jchye
30  *
31  */
32 public class CompareIcuOutput {
33     private static final boolean DEBUG = false;
34 
35     private static final Options options = new Options(
36         "Usage: RBChecker [OPTIONS] DIR1 DIR2 FILE_REGEX\n" +
37             "This program is used to compare the RB text files in two different directories.\n" +
38             "  Example: org.unicode.cldr.icu.RBChecker olddatadir newdatadir .*")
39                 .add("sort", 's', null, null, "Sort values for comparison");
40 
41     private static final Comparator<String[]> comparator = new Comparator<String[]>() {
42         @Override
43         public int compare(String[] arg0, String[] arg1) {
44             return arg0[0].compareTo(arg1[0]);
45         }
46     };
47 
48     private static boolean shouldSort = false;
49 
main(String[] args)50     public static void main(String[] args) throws IOException {
51         String dir1 = args[0];
52         String dir2 = args[1];
53         String regex = args[2];
54         System.out.println("dir1 " + dir1);
55         System.out.println("dir2 " + dir2);
56         System.out.println("regex " + regex);
57         shouldSort = options.get("sort").doesOccur();
58         long totaltime = System.currentTimeMillis();
59         System.out.println("Comparing the contents of text files...");
60         compareTextFiles(dir1, dir2, regex);
61         System.out.println("Total time taken: " + (System.currentTimeMillis() - totaltime));
62     }
63 
64     /**
65      * Parses and compares two ICU textfiles.
66      *
67      * @param dir1
68      * @param dir2
69      * @param regex
70      * @throws IOException
71      */
compareTextFiles(String dir1, String dir2, String regex)72     private static void compareTextFiles(String dir1, String dir2, String regex) throws IOException {
73         File localeDir = new File(dir1);
74         if (!localeDir.exists()) localeDir = new File(dir1);
75         String[] filenames = localeDir.list();
76         int same = 0, different = 0;
77         for (String filename : filenames) {
78             if (!filename.matches(regex + "\\.txt")) continue;
79             String locale = filename.substring(0, filename.length() - 4);
80             try {
81                 IcuData oldData = loadDataFromTextfiles(dir1, locale);
82                 IcuData newData = loadDataFromTextfiles(dir2, locale);
83                 StringBuffer messages = new StringBuffer();
84                 if (analyseMatches(oldData, newData, messages)) {
85                     System.out.println("=== Differences found for " + locale + " ===");
86                     System.out.print(messages);
87                     different++;
88                 } else {
89                     same++;
90                 }
91             } catch (FileNotFoundException e) {
92                 System.err.println(locale + " file not found, skipping");
93             }
94         }
95         System.out.println("Check finished with " + different + " different and " + same + " same locales.");
96     }
97 
loadDataFromTextfiles(String icuPath, String locale)98     private static IcuData loadDataFromTextfiles(String icuPath, String locale) throws IOException {
99         List<Row.R2<MyTokenizer.Type, String>> comments = new ArrayList<Row.R2<MyTokenizer.Type, String>>();
100         IcuData icuData = new IcuData(locale + ".xml", locale, true);
101         String filename = icuPath + '/' + locale + ".txt";
102         if (new File(filename).exists()) {
103             parseRB(filename, icuData, comments);
104         } else {
105             throw new FileNotFoundException(filename + " does not exist.");
106         }
107         return icuData;
108     }
109 
110     /**
111      * Computes lists of all differences between two sets of IcuData.
112      *
113      * @param oldData
114      * @param newData
115      */
analyseMatches(IcuData oldData, IcuData newData, StringBuffer buffer)116     private static boolean analyseMatches(IcuData oldData, IcuData newData, StringBuffer buffer) {
117         boolean hasDifferences = false;
118         Set<String> missing = new TreeSet<String>(oldData.keySet());
119         missing.removeAll(newData.keySet());
120         if (missing.size() > 0) {
121             buffer.append("Missing paths:\n");
122             printAllInSet(oldData, missing, buffer);
123             hasDifferences = true;
124         }
125         Set<String> extra = new TreeSet<String>(newData.keySet());
126         extra.removeAll(oldData.keySet());
127         if (extra.size() > 0) {
128             buffer.append("Extra paths:\n");
129             printAllInSet(newData, extra, buffer);
130             hasDifferences = true;
131         }
132         Set<String> common = new TreeSet<String>(oldData.keySet());
133         common.retainAll(newData.keySet());
134         for (String rbPath : common) {
135             if (rbPath.startsWith("/Version")) continue; // skip version
136             List<String[]> oldValues = oldData.get(rbPath);
137             List<String[]> newValues = newData.get(rbPath);
138             if (shouldSort) {
139                 Collections.sort(oldValues, comparator);
140                 Collections.sort(newValues, comparator);
141             }
142             // Print out any value differences.
143             if (valuesDiffer(oldValues, newValues)) {
144                 buffer.append(rbPath + " contains differences:\n");
145                 buffer.append("\tOld: ");
146                 printValues(oldValues, buffer);
147                 buffer.append("\tNew: ");
148                 printValues(newValues, buffer);
149                 hasDifferences = true;
150             }
151         }
152         return hasDifferences;
153     }
154 
printAllInSet(IcuData icuData, Set<String> paths, StringBuffer buffer)155     private static void printAllInSet(IcuData icuData, Set<String> paths, StringBuffer buffer) {
156         for (String path : paths) {
157             buffer.append("\t" + path + " = ");
158             printValues(icuData.get(path), buffer);
159         }
160     }
161 
printValues(List<String[]> values, StringBuffer buffer)162     private static void printValues(List<String[]> values, StringBuffer buffer) {
163         // Enclose both numbers and strings in quotes for simplicity.
164         for (String[] array : values) {
165             if (array.length == 1) {
166                 buffer.append('"' + array[0] + '"');
167             } else {
168                 buffer.append("[");
169                 for (String value : array) {
170                     buffer.append('"' + value + "\", ");
171                 }
172                 buffer.append("]");
173             }
174             buffer.append(", ");
175         }
176         buffer.append('\n');
177     }
178 
179     /**
180      * @param oldValues
181      * @param newValues
182      * @return true if the contents of the lists are identical
183      */
valuesDiffer(List<String[]> oldValues, List<String[]> newValues)184     private static boolean valuesDiffer(List<String[]> oldValues, List<String[]> newValues) {
185         if (oldValues.size() != newValues.size()) return true;
186         boolean differ = false;
187         for (int i = 0; i < oldValues.size(); i++) {
188             String[] oldArray = oldValues.get(i);
189             String[] newArray = newValues.get(i);
190             if (oldArray.length != newArray.length) {
191                 differ = true;
192                 break;
193             }
194             for (int j = 0; j < oldArray.length; j++) {
195                 // Ignore whitespace.
196                 if (!oldArray[j].replace(" ", "").equals(newArray[j].replace(" ", ""))) {
197                     differ = true;
198                     break;
199                 }
200             }
201         }
202         return differ;
203     }
204 
205     /**
206      * Parse an ICU resource bundle into key,value items
207      *
208      * @param filename
209      * @param output
210      * @param comments
211      */
parseRB(String filename, IcuData icuData, List<R2<MyTokenizer.Type, String>> comments)212     static void parseRB(String filename, IcuData icuData, List<R2<MyTokenizer.Type, String>> comments)
213         throws IOException {
214         BufferedReader in = null;
215         File file = new File(filename);
216         String coreFile = file.getName();
217         if (!coreFile.endsWith(".txt")) {
218             throw new IllegalArgumentException("missing .txt in: " + filename);
219         }
220         coreFile = coreFile.substring(0, coreFile.length() - 4);
221         // redo this later on to use fixed PatternTokenizer
222         in = FileUtilities.openUTF8Reader("", filename);
223         MyTokenizer tokenIterator = new MyTokenizer(in);
224         StringBuffer tokenText = new StringBuffer();
225         List<String> oldPaths = new ArrayList<String>();
226         List<Integer> indices = new ArrayList<Integer>();
227         String lastLabel = null;
228         String path = "";
229         /*
230          * AuxExemplarCharacters{
231          * "[á à ă â å ä ã ā æ ç é è ĕ ê ë ē í ì ĭ î ï ī ñ ó ò ŏ ô ö ø ō œ ú ù ŭ û ü ū ÿ"
232          * "]" } ExemplarCharacters{
233          * "[a b c d e f g h i j k l m n o p q r s t u v w x y z]"}
234          * ExemplarCharactersCurrency
235          * {"[a b c č d e f g h i j k l ł m n o º p q r s t u v w x y z]"}
236          * ExemplarCharactersIndex
237          * {"[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z]"}
238          * ExemplarCharactersPunctuation{"[\- ‐ – — , ; \: ! ? . … ' ‘ ’ \"
239          * “ ” ( ) \[ \] @ * / \& # † ‡ ′ ″ §]"}
240          */
241         MyTokenizer.Type lastToken = null;
242         List<String> arrayValues = null;
243         while (true) {
244             MyTokenizer.Type nextToken = tokenIterator.next(tokenText);
245             if (DEBUG)
246                 System.out.println(nextToken + "\t" + tokenText);
247             switch (nextToken) {
248             case BLOCK_COMMENT:
249             case LINE_COMMENT:
250                 if (comments != null) {
251                     comments.add(Row.of(nextToken, tokenText.toString()));
252                 }
253                 continue;
254             case DONE:
255                 if (oldPaths.size() != 0) {
256                     throw new IllegalArgumentException("missing }");
257                 }
258                 in.close();
259                 return;
260             case ID:
261                 lastLabel = lastLabel == null ? tokenText.toString() : lastLabel + " " + tokenText;
262                 break;
263             case QUOTED:
264                 if (lastLabel == null) {
265                     lastLabel = tokenText.toString();
266                 } else {
267                     // Remove consecutive quotes.
268                     lastLabel += tokenText;
269                 }
270                 break;
271             case OPEN_BRACE:
272                 // Check for array-type values.
273                 if (lastToken == MyTokenizer.Type.COMMA) {
274                     arrayValues = new ArrayList<String>();
275                 } else {
276                     oldPaths.add(path);
277                     indices.add(0);
278                     if (lastToken == MyTokenizer.Type.OPEN_BRACE || lastToken == MyTokenizer.Type.CLOSE_BRACE) {
279                         int currentIndexPos = indices.size() - 2;
280                         int currentIndex = indices.get(currentIndexPos);
281                         lastLabel = "<" + currentIndex + ">";
282                         indices.set(currentIndexPos, currentIndex + 1);
283                     } else if (lastLabel.contains(":") && !lastLabel.contains(":int") && !lastLabel.contains(":alias")
284                         || path.endsWith("/relative")) {
285                         lastLabel = '"' + lastLabel + '"';
286                     }
287                     path += "/" + lastLabel;
288                 }
289                 lastLabel = null;
290                 break;
291             case CLOSE_BRACE:
292                 if (lastLabel != null) {
293                     addPath(path, lastLabel, icuData);
294                     lastLabel = null;
295                 }
296 
297                 if (arrayValues == null) {
298                     path = oldPaths.remove(oldPaths.size() - 1);
299                     indices.remove(indices.size() - 1);
300                 } else {
301                     // Value array closed, add it to the path.
302                     String[] array = new String[0];
303                     addPath(path, arrayValues.toArray(array), icuData);
304                     arrayValues = null;
305                 }
306                 if (DEBUG)
307                     System.out.println("POP:\t" + path);
308                 break;
309             case COMMA:
310                 if (lastToken != MyTokenizer.Type.QUOTED && lastToken != MyTokenizer.Type.ID) {
311                     throw new IllegalArgumentException(filename + ", " + path + ": Commas can only occur after values ");
312                 } else if (lastLabel == null) {
313                     throw new IllegalArgumentException(filename + ": Label missing!");
314                 }
315                 if (arrayValues != null) {
316                     arrayValues.add(lastLabel);
317                 } else {
318                     addPath(path, lastLabel, icuData);
319                 }
320                 lastLabel = null;
321                 break;
322             default:
323                 throw new IllegalArgumentException("Illegal type in " + filename + ": " + nextToken + "\t" + tokenText
324                     + "\t" + Utility.hex(tokenText));
325             }
326             lastToken = nextToken;
327         }
328     }
329 
addPath(String path, String value, IcuData icuData)330     private static void addPath(String path, String value, IcuData icuData) {
331         addPath(path, new String[] { value }, icuData);
332     }
333 
addPath(String path, String[] values, IcuData icuData)334     private static void addPath(String path, String[] values, IcuData icuData) {
335         path = path.substring(path.indexOf('/', 1));
336         icuData.add(path, values);
337     }
338 
339     /**
340      * Reads in tokens from an ICU data file reader.
341      * Replace by updated PatternTokenizer someday
342      *
343      * @author markdavis
344      *
345      */
346     static class MyTokenizer {
347         enum Type {
348             DONE, ID, QUOTED, OPEN_BRACE, CLOSE_BRACE, COMMA, LINE_COMMENT, BLOCK_COMMENT, BROKEN_QUOTE, BROKEN_BLOCK_COMMENT, UNKNOWN
349         }
350 
351         private final UForwardCharacterIterator source;
352         private final UnicodeSet spaceCharacters = new UnicodeSet("[\\u0000\\uFEFF[:pattern_whitespace:]]");
353         private final UnicodeSet idCharacters = new UnicodeSet("[-+.():%\"'[:xid_continue:]]");
354         private final UnicodeSet quoteCharacters = new UnicodeSet("[\"']");
355 
356         private int bufferedChar;
357 
358         /**
359          * @param reader
360          */
MyTokenizer(Reader reader)361         public MyTokenizer(Reader reader) {
362             this.source = new UReaderForwardCharacterIterator(reader);
363         }
364 
next(StringBuffer tokenText)365         public Type next(StringBuffer tokenText) {
366             int cp = getCodePoint();
367             // Skip all spaces not in quotes.
368             while (cp >= 0 && spaceCharacters.contains(cp)) {
369                 cp = getCodePoint();
370             }
371 
372             if (cp == -1) {
373                 return Type.DONE;
374             }
375             tokenText.setLength(0);
376             if (cp == '/') {
377                 cp = getCodePoint();
378                 if (cp == '/') { // line comment
379                     while (true) {
380                         cp = getCodePoint();
381                         if (cp == '\n' || cp < 0) {
382                             return Type.LINE_COMMENT;
383                         }
384                         tokenText.appendCodePoint(cp);
385                     }
386                 } else if (cp == '*') { // block comment
387                     while (true) {
388                         cp = getCodePoint();
389                         if (cp < 0) {
390                             return Type.BROKEN_BLOCK_COMMENT;
391                         }
392                         while (cp == '*') {
393                             int cp2 = getCodePoint();
394                             if (cp2 < 0) {
395                                 return Type.BROKEN_BLOCK_COMMENT;
396                             } else if (cp2 == '/') {
397                                 return Type.BLOCK_COMMENT;
398                             }
399                             tokenText.appendCodePoint(cp);
400                             cp = cp2;
401                         }
402                         tokenText.appendCodePoint(cp);
403                     }
404                 } else {
405                     throw new IllegalArgumentException("/ can only be in quotes or comments");
406                 }
407             }
408             if (quoteCharacters.contains(cp)) {
409                 // Return the text inside and *excluding* the quotes.
410                 int oldQuote = cp;
411                 cp = getCodePoint();
412                 while (cp != oldQuote) {
413                     if (cp < 0) {
414                         return Type.BROKEN_QUOTE;
415                     } else if (cp == '\\') {
416                         tokenText.appendCodePoint(cp);
417                         cp = getCodePoint();
418                         if (cp < 0) {
419                             return Type.BROKEN_QUOTE;
420                         }
421                     }
422                     tokenText.appendCodePoint(cp);
423                     cp = getCodePoint();
424                 }
425                 ;
426                 return Type.QUOTED;
427             }
428             if (cp == '{') {
429                 return Type.OPEN_BRACE;
430             }
431             if (cp == '}') {
432                 return Type.CLOSE_BRACE;
433             }
434             if (cp == ',') {
435                 return Type.COMMA;
436             }
437             if (idCharacters.contains(cp)) {
438                 while (true) {
439                     tokenText.appendCodePoint(cp);
440                     cp = getCodePoint();
441                     if (cp < 0 || !idCharacters.contains(cp)) {
442                         pushCodePoint(cp);
443                         return Type.ID;
444                     }
445                 }
446             }
447             tokenText.appendCodePoint(cp);
448             return Type.UNKNOWN;
449         }
450 
getCodePoint()451         int getCodePoint() {
452             if (bufferedChar >= 0) {
453                 int result = bufferedChar;
454                 bufferedChar = -1;
455                 return result;
456             }
457             return source.nextCodePoint();
458         }
459 
pushCodePoint(int codepoint)460         void pushCodePoint(int codepoint) {
461             if (bufferedChar >= 0) {
462                 throw new IllegalArgumentException("Cannot push twice");
463             }
464             bufferedChar = codepoint;
465         }
466     }
467 
468     public static class UReaderForwardCharacterIterator implements UForwardCharacterIterator {
469         private Reader reader;
470         private int bufferedChar = -1;
471 
472         /**
473          * @param reader
474          */
UReaderForwardCharacterIterator(Reader reader)475         public UReaderForwardCharacterIterator(Reader reader) {
476             this.reader = reader;
477         }
478 
479         /*
480          * (non-Javadoc)
481          *
482          * @see com.ibm.icu.text.UForwardCharacterIterator#next()
483          */
next()484         public int next() {
485             if (bufferedChar >= 0) {
486                 int temp = bufferedChar;
487                 bufferedChar = -1;
488                 return temp;
489             }
490             try {
491                 return reader.read();
492             } catch (IOException e) {
493                 throw new IllegalArgumentException(e);
494             }
495         }
496 
497         /*
498          * (non-Javadoc)
499          *
500          * @see com.ibm.icu.text.UForwardCharacterIterator#nextCodePoint()
501          */
nextCodePoint()502         public int nextCodePoint() {
503             int ch1 = next();
504             if (UTF16.isLeadSurrogate((char) ch1)) {
505                 int bufferedChar = next();
506                 if (UTF16.isTrailSurrogate((char) bufferedChar)) {
507                     return UCharacter.getCodePoint((char) ch1,
508                         (char) bufferedChar);
509                 }
510             }
511             return ch1;
512         }
513     }
514 }
515