• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  *
3  */
4 package org.unicode.cldr.tool;
5 
6 import java.io.BufferedReader;
7 import java.io.IOException;
8 import java.util.Iterator;
9 import java.util.LinkedHashMap;
10 import java.util.Locale;
11 import java.util.Map;
12 import java.util.Random;
13 //import java.util.Set;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
16 
17 import org.unicode.cldr.draft.FileUtilities;
18 import org.unicode.cldr.util.BNF;
19 import org.unicode.cldr.util.CldrUtility;
20 import org.unicode.cldr.util.LanguageTagParser;
21 //import org.unicode.cldr.util.StandardCodes;
22 import org.unicode.cldr.util.Quoter;
23 
24 import com.ibm.icu.util.ULocale;
25 
/**
 * Tests language tags.
 * <p>
 * Internally, it generates a Regex Pattern for BCP 47 language tags, plus an ICU BNF pattern. The first is a regular
 * Java/Perl style pattern. The ICU BNF will generate random strings that will match that regex.
 * <p>
 * Use -Dbnf=xxx for the source regex definition file, and -Dtest=yyy for the test file. Example:
 * -Dbnf=/Users/markdavis/Documents/workspace/cldr-code/java/org/unicode/cldr/util/data/langtagRegex.txt
 *
 * @author markdavis
 *
 */
38 class CheckLangTagBNF {
39     private static final String LANGUAGE_TAG_TEST_FILE = CldrUtility.getProperty("test");
40     private static final String BNF_DEFINITION_FILE = CldrUtility.getProperty("bnf");
41 
42     private String rules;
43     private String generationRules;
44     private Pattern pattern;
45     private BNF bnf;
46 
47     private static final String[] groupNames = { "whole", "lang", "script", "region", "variants", "extensions",
48         "privateuse",
49         "grandfathered", "privateuse", "localeExtensions"
50     };
51 
52     /**
53      * Set the regex to use for testing, based on the contents of a file.
54      *
55      * @param filename
56      * @return
57      * @throws IOException
58      */
setFromFile(String filename)59     public CheckLangTagBNF setFromFile(String filename) throws IOException {
60         BufferedReader in = FileUtilities.openUTF8Reader("", filename);
61         CldrUtility.VariableReplacer result = new CldrUtility.VariableReplacer();
62         String variable = null;
63         StringBuffer definition = new StringBuffer();
64         StringBuffer ruleBuffer = new StringBuffer();
65         StringBuffer generationRuleBuffer = new StringBuffer();
66         for (int count = 1;; ++count) {
67             String line = in.readLine();
68             if (line == null) break;
69             ruleBuffer.append(line).append(CldrUtility.LINE_SEPARATOR);
70             // remove initial bom, comments
71             if (line.length() == 0) continue;
72             if (line.charAt(0) == '\uFEFF') line = line.substring(1);
73             int hashPos = line.indexOf('#');
74             if (hashPos >= 0) line = line.substring(0, hashPos);
75             String trimline = line.trim();
76             if (trimline.length() == 0) continue;
77             generationRuleBuffer.append(trimline).append(CldrUtility.LINE_SEPARATOR);
78 
79             // String[] lineParts = line.split(";");
80             String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
81             if (linePart.trim().length() == 0) continue;
82             boolean terminated = trimline.endsWith(";");
83             if (terminated) {
84                 linePart = linePart.substring(0, linePart.lastIndexOf(';'));
85             }
86             int equalsPos = linePart.indexOf('=');
87             if (equalsPos >= 0) {
88                 if (variable != null) {
89                     throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
90                 }
91                 variable = linePart.substring(0, equalsPos).trim();
92                 definition.append(linePart.substring(equalsPos + 1).trim());
93             } else { // no equals, so
94                 if (variable == null) {
95                     throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
96                 }
97                 definition.append(CldrUtility.LINE_SEPARATOR).append(linePart);
98             }
99             // we are terminated if i is not at the end, or the line ends with a ;
100             if (terminated) {
101                 result.add(variable, result.replace(definition.toString()));
102                 variable = null; // signal we have no variable
103                 definition.setLength(0);
104             }
105         }
106         if (variable != null) {
107             throw new IllegalArgumentException("Missing ';' at end");
108         }
109         String resolved = result.replace("$root").replaceAll("[0-9]+%", "");
110         System.out.println("Regex: " + resolved);
111         rules = ruleBuffer.toString();
112         generationRules = generationRuleBuffer.toString().replaceAll("\\?:", "").replaceAll("\\(\\?i\\)", "");
113         pattern = Pattern.compile(resolved, Pattern.COMMENTS);
114         return this;
115     }
116 
117     private static Random random = new Random(3);
118 
randomizeAsciiCase(String s)119     private static String randomizeAsciiCase(String s) {
120         StringBuilder result = new StringBuilder();
121         for (int i = 0; i < s.length(); ++i) {
122             char c = s.charAt(i);
123             if ('A' <= c && c <= 'Z') {
124                 if (random.nextBoolean()) {
125                     c += 32;
126                 }
127             } else if ('a' <= c && c <= 'z') {
128                 if (random.nextBoolean()) {
129                     c -= 32;
130                 }
131             }
132             result.append(c);
133         }
134         return result.toString();
135     }
136 
getBnf()137     public BNF getBnf() {
138         if (bnf != null) return bnf;
139         bnf = new BNF(new Random(2), new Quoter.RuleQuoter())
140             .setMaxRepeat(5)
141             .addRules(generationRules)
142             .complete();
143         return bnf;
144     }
145 
getPattern()146     public Pattern getPattern() {
147         return pattern;
148     }
149 
getRules()150     public String getRules() {
151         return rules;
152     }
153 
getGenerationRules()154     public String getGenerationRules() {
155         return generationRules;
156     }
157 
158     /**
159      * Tests a file for correctness.
160      * There are two special lines in the file: WELL-FORMED and ILL-FORMED,
161      * that signal the start of each section.
162      *
163      * @param args
164      * @throws IOException
165      */
main(String[] args)166     public static void main(String[] args) throws IOException {
167         CheckLangTagBNF bnfData = new CheckLangTagBNF();
168         bnfData.setFromFile(BNF_DEFINITION_FILE);
169         String contents = bnfData.getRules();
170         Pattern pat = bnfData.getPattern();
171         Matcher regexLanguageTag = pat.matcher("");
172 
173         Locale loc = new Locale("fOo", "fIi", "bAr");
174         System.out.println("locale.getLanguage " + loc.getLanguage());
175         System.out.println("locale.getCountry " + loc.getCountry());
176         System.out.println("locale.getVariant " + loc.getVariant());
177 
178         ULocale loc2 = new ULocale("eS_latN-eS@currencY=EUR;collatioN=traditionaL");
179         System.out.println("ulocale.getLanguage " + loc2.getLanguage());
180         System.out.println("ulocale.getScript " + loc2.getScript());
181         System.out.println("ulocale.getCountry " + loc2.getCountry());
182         System.out.println("ulocale.getVariant " + loc2.getVariant());
183         for (Iterator<String> it = loc2.getKeywords(); it.hasNext();) {
184             String keyword = it.next();
185             System.out.println("\tulocale.getKeywords " + keyword + " = " + loc2.getKeywordValue(keyword));
186         }
187 
188         BNF bnf = bnfData.getBnf();
189         for (int i = 0; i < 100; ++i) {
190             String trial = bnf.next();
191             trial = randomizeAsciiCase(trial);
192             System.out.println(trial);
193             if (!regexLanguageTag.reset(trial).matches()) {
194                 throw new IllegalArgumentException("Regex generation fails with: " + trial);
195             }
196         }
197 
198         // generate a bunch of ill-formed items. Try to favor ones that might actually cause problems.
199         // TODO make all numeric and all alpha more common
200         System.out.println("*** ILL-FORMED ***");
201         BNF invalidBNF = new BNF(new Random(0), new Quoter.RuleQuoter())
202             .setMaxRepeat(5)
203             .addRules("$tag = ([A-Z a-z 0-9]{1,8} 50% 20% 10% 5% 5% 5% 5%);")
204             .addRules("$s = [-_] ;")
205             .addRules("$root = $tag ($s $tag){0,7} 10% 10% 10% 10% 10% 10% 10% 10% ; ")
206             .complete();
207 
208         for (int i = 0; i < 100; ++i) {
209             String trial = invalidBNF.next();
210             if (regexLanguageTag.reset(trial).matches()) {
211                 continue;
212             }
213             System.out.println(trial);
214         }
215 
216         System.out.println(contents);
217 
218         // System.out.println(langTagPattern);
219         // System.out.println(cleanedLangTagPattern);
220 //        StandardCodes sc = StandardCodes.make();
221 //        Set<String> grandfathered = sc.getAvailableCodes("grandfathered");
222         // for (Iterator it = grandfathered.iterator(); it.hasNext();) {
223         // System.out.print(it.next() + " | ");
224         // }
225         // System.out.println();
226 
227         LanguageTagParser ltp = new LanguageTagParser();
228         SimpleLocaleParser simpleLocaleParser = new SimpleLocaleParser();
229         boolean expected = true;
230         int errorCount = 0;
231         BufferedReader in = FileUtilities.openUTF8Reader("", LANGUAGE_TAG_TEST_FILE);
232 
233         while (true) {
234             String test = in.readLine();
235             if (test == null) break;
236 
237             // remove initial bom, comments
238             if (test.length() == 0) continue;
239             if (test.charAt(0) == '\uFEFF') test = test.substring(1);
240             int hashPos = test.indexOf('#');
241             if (hashPos >= 0) test = test.substring(0, hashPos);
242             test = test.trim(); // this may seem redundant, but we need it for the test for final ;
243             if (test.length() == 0) continue;
244 
245             if (test.equalsIgnoreCase("WELL-FORMED")) {
246                 expected = true;
247                 continue;
248             } else if (test.equalsIgnoreCase("ILL-FORMED")) {
249                 expected = false;
250                 continue;
251             }
252             System.out.println("Parsing " + test);
253             checkParse(ltp, simpleLocaleParser, test);
254             boolean matches = regexLanguageTag.reset(test).matches();
255             if (matches != expected) {
256                 System.out.println("*** TEST FAILURE ***");
257                 ++errorCount;
258             }
259 
260             System.out.println("\tregex?\t" + matches
261                 + (matches == expected ? "" : "\t EXPECTED: " + expected + " for\t" + test));
262             if (matches) {
263                 for (int j = 0; j <= regexLanguageTag.groupCount(); ++j) {
264                     String g = regexLanguageTag.group(j);
265                     if (g == null || g.length() == 0) continue;
266                     System.out.println("\t" + j + "\t" + CheckLangTagBNF.groupNames[j] + ":\t" + g);
267                 }
268             }
269         }
270         System.out.println("Error count: " + errorCount);
271     }
272 
checkParse(LanguageTagParser ltp, SimpleLocaleParser slp, String test)273     private static void checkParse(LanguageTagParser ltp, SimpleLocaleParser slp, String test) {
274         try {
275             ltp.set(test);
276             boolean couldParse = slp.set(test);
277             if (!couldParse) {
278                 System.out.println("###Coundn't parse: test");
279             } else {
280                 System.out.println("Simple Parser: " + slp.toString());
281                 String lang = ltp.getLanguage();
282                 if (lang.length() == 0) {
283                     lang = "und";
284                 }
285                 checkStrings("language", lang, slp.getLanguage());
286                 checkStrings("script", ltp.getScript(), slp.getScript());
287                 checkStrings("country", ltp.getRegion(), slp.getCountry());
288                 checkStrings("variants", ltp.getVariants(), slp.getVariants());
289                 Map<String, String> foo = new LinkedHashMap<String, String>();
290                 foo.putAll(ltp.getExtensions());
291                 foo.putAll(ltp.getLocaleExtensions());
292                 checkStrings("variants", foo, slp.getExtensions());
293             }
294 
295             if (ltp.getLanguage().length() != 0)
296                 System.out.println("\tlang:    \t" + ltp.getLanguage()
297                     + (ltp.isGrandfathered() ? " (grandfathered)" : ""));
298             if (ltp.getScript().length() != 0) System.out.println("\tscript:\t" + ltp.getScript());
299             if (ltp.getRegion().length() != 0) System.out.println("\tregion:\t" + ltp.getRegion());
300             if (ltp.getVariants().size() != 0) System.out.println("\tvariants:\t" + ltp.getVariants());
301             if (ltp.getExtensions().size() != 0) System.out.println("\textensions:\t" + ltp.getExtensions());
302             if (ltp.getLocaleExtensions().size() != 0)
303                 System.out.println("\tlocale extensions:\t" + ltp.getLocaleExtensions());
304             System.out.println("\tisValid?\t" + ltp.isValid());
305         } catch (Exception e) {
306             System.out.println("\t" + e.getMessage());
307             System.out.println("\tisValid?\tfalse");
308         }
309     }
310 
checkStrings(String message, T obj1, T obj2)311     private static <T> void checkStrings(String message, T obj1, T obj2) {
312         String object1 = obj1.toString().replace('_', '-');
313         String object2 = obj2.toString().replace('_', '-');
314         if (!object1.equals(object2)) {
315             if (object1.equalsIgnoreCase(object2)) {
316                 System.out.println("$$$Case Difference at " + message + "<" + obj1 + "> != <" + obj2 + ">");
317             } else {
318                 System.out.println("###Difference at " + message + "<" + obj1 + "> != <" + obj2 + ">");
319             }
320         }
321     }
322 }