/**
 *
 */
package org.unicode.cldr.tool;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Random;
//import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.BNF;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.LanguageTagParser;
//import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.Quoter;

import com.ibm.icu.util.ULocale;

/**
 * Tests language tags.
 * <p>
 * Internally, it generates a Regex Pattern for BCP 47 language tags, plus an ICU BNF pattern. The first is a regular
 * Java/Perl style pattern. The ICU BNF will generate random strings that will match that regex.
 * <p>
 * Use -Dbnf=xxx for the source regex definition file, and -Dtest=yyy for the test file. Example:
 * -Dbnf=/Users/markdavis/Documents/workspace/cldr-code/java/org/unicode/cldr/util/data/langtagRegex.txt
 *
 * @author markdavis
 *
 */
class CheckLangTagBNF {
    private static final String LANGUAGE_TAG_TEST_FILE = CldrUtility.getProperty("test");
    private static final String BNF_DEFINITION_FILE = CldrUtility.getProperty("bnf");

    private String rules;
    private String generationRules;
    private Pattern pattern;
    private BNF bnf;

    /**
     * Display labels for the capturing groups of the language tag regex, indexed by group number
     * (index 0 is the whole match).
     */
    private static final String[] groupNames = { "whole", "lang", "script", "region", "variants", "extensions",
        "privateuse",
        "grandfathered", "privateuse", "localeExtensions"
    };

    /**
     * Set the regex to use for testing, based on the contents of a file.
     * <p>
     * The file consists of definitions of the form <code>variable = definition ;</code>. A definition may
     * continue over several lines and ends at the first line whose trimmed text ends with ';'. Everything
     * from a '#' to the end of a line is dropped as a comment, and empty lines are skipped.
     * <p>
     * After reading, the expansion of <code>$root</code> (with every run of digits followed by '%' removed)
     * is compiled with {@link Pattern#COMMENTS} and is available from {@link #getPattern()}. The raw file
     * text is available from {@link #getRules()}, and the comment-free lines (with "?:" and "(?i)" removed)
     * from {@link #getGenerationRules()}.
     *
     * @param filename
     *            name of the definition file, passed to FileUtilities.openUTF8Reader with an empty directory
     * @return this object, to allow chaining
     * @throws IOException
     *             if the file cannot be read
     * @throws IllegalArgumentException
     *             if a definition is missing its '=' or its terminating ';'
     */
    public CheckLangTagBNF setFromFile(String filename) throws IOException {
        CldrUtility.VariableReplacer result = new CldrUtility.VariableReplacer();
        String variable = null; // name of the definition being accumulated, or null between definitions
        StringBuilder definition = new StringBuilder();
        StringBuilder ruleBuffer = new StringBuilder();
        StringBuilder generationRuleBuffer = new StringBuilder();

        BufferedReader in = FileUtilities.openUTF8Reader("", filename);
        try {
            // count is incremented on every pass (including skipped lines), so it is the 1-based line number
            for (int count = 1;; ++count) {
                String line = in.readLine();
                if (line == null) {
                    break;
                }
                // the raw rules keep every line exactly as read, including comments
                ruleBuffer.append(line).append(CldrUtility.LINE_SEPARATOR);

                // remove initial bom, comments
                if (line.length() == 0) {
                    continue;
                }
                if (line.charAt(0) == '\uFEFF') {
                    line = line.substring(1);
                }
                int hashPos = line.indexOf('#');
                if (hashPos >= 0) {
                    line = line.substring(0, hashPos);
                }
                String trimline = line.trim();
                if (trimline.length() == 0) {
                    continue;
                }
                generationRuleBuffer.append(trimline).append(CldrUtility.LINE_SEPARATOR);

                String linePart = line;
                boolean terminated = trimline.endsWith(";");
                if (terminated) {
                    // drop the terminating ';' (and any whitespace after it)
                    linePart = linePart.substring(0, linePart.lastIndexOf(';'));
                }
                int equalsPos = linePart.indexOf('=');
                if (equalsPos >= 0) {
                    // start of a new definition; the previous one must have been closed
                    if (variable != null) {
                        throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
                    }
                    variable = linePart.substring(0, equalsPos).trim();
                    definition.append(linePart.substring(equalsPos + 1).trim());
                } else { // no equals, so this line continues the current definition
                    if (variable == null) {
                        throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
                    }
                    definition.append(CldrUtility.LINE_SEPARATOR).append(linePart);
                }
                if (terminated) {
                    // the definition is passed through the replacer before being registered, presumably
                    // so that references to earlier variables are expanded
                    result.add(variable, result.replace(definition.toString()));
                    variable = null; // signal we have no variable
                    definition.setLength(0);
                }
            }
        } finally {
            in.close();
        }
        if (variable != null) {
            throw new IllegalArgumentException("Missing ';' at end");
        }
        // strip digit-and-percent tokens (presumably generation weights) before compiling the regex
        String resolved = result.replace("$root").replaceAll("[0-9]+%", "");
        System.out.println("Regex: " + resolved);
        rules = ruleBuffer.toString();
        generationRules = generationRuleBuffer.toString().replaceAll("\\?:", "").replaceAll("\\(\\?i\\)", "");
        pattern = Pattern.compile(resolved, Pattern.COMMENTS);
        return this;
    }

    /** Fixed seed, so that the case randomization is repeatable from run to run. */
    private static final Random random = new Random(3);

    /**
     * Returns a copy of the string in which each ASCII letter has, with probability 1/2, had its case
     * flipped. All other characters are copied unchanged.
     *
     * @param s
     *            source string
     * @return the string with randomized ASCII case
     */
    private static String randomizeAsciiCase(String s) {
        StringBuilder result = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); ++i) {
            char c = s.charAt(i);
            if ('A' <= c && c <= 'Z') {
                if (random.nextBoolean()) {
                    c += 32; // to lowercase
                }
            } else if ('a' <= c && c <= 'z') {
                if (random.nextBoolean()) {
                    c -= 32; // to uppercase
                }
            }
            result.append(c);
        }
        return result.toString();
    }

    /**
     * Returns the BNF built from the generation rules, creating it on first use.
     * {@link #setFromFile(String)} must have been called first.
     *
     * @return the cached BNF generator
     */
    public BNF getBnf() {
        if (bnf != null) {
            return bnf;
        }
        bnf = new BNF(new Random(2), new Quoter.RuleQuoter())
            .setMaxRepeat(5)
            .addRules(generationRules)
            .complete();
        return bnf;
    }

    /**
     * @return the compiled language tag regex, or null if {@link #setFromFile(String)} has not been called
     */
    public Pattern getPattern() {
        return pattern;
    }

    /**
     * @return the full text of the definition file as read, or null if {@link #setFromFile(String)} has not
     *         been called
     */
    public String getRules() {
        return rules;
    }

    /**
     * @return the definition file text with comments and empty lines removed and with "?:" and "(?i)"
     *         stripped, or null if {@link #setFromFile(String)} has not been called
     */
    public String getGenerationRules() {
        return generationRules;
    }

    /**
     * Returns the display label for a regex capturing group. Falls back to a generic label when the
     * regex has more groups than there are known names, rather than failing.
     *
     * @param index
     *            capturing group number
     * @return label for the group
     */
    private static String groupName(int index) {
        return index < groupNames.length ? groupNames[index] : "group" + index;
    }

    /**
     * Tests a file for correctness.
     * There are two special lines in the file: WELL-FORMED and ILL-FORMED,
     * that signal the start of each section.
     *
     * @param args
     *            unused; the input files are taken from the "bnf" and "test" properties
     * @throws IOException
     *             if either file cannot be read
     */
    public static void main(String[] args) throws IOException {
        CheckLangTagBNF bnfData = new CheckLangTagBNF();
        bnfData.setFromFile(BNF_DEFINITION_FILE);
        String contents = bnfData.getRules();
        Pattern pat = bnfData.getPattern();
        Matcher regexLanguageTag = pat.matcher("");

        // show how Locale and ULocale treat oddly cased input
        Locale loc = new Locale("fOo", "fIi", "bAr");
        System.out.println("locale.getLanguage " + loc.getLanguage());
        System.out.println("locale.getCountry " + loc.getCountry());
        System.out.println("locale.getVariant " + loc.getVariant());

        ULocale loc2 = new ULocale("eS_latN-eS@currencY=EUR;collatioN=traditionaL");
        System.out.println("ulocale.getLanguage " + loc2.getLanguage());
        System.out.println("ulocale.getScript " + loc2.getScript());
        System.out.println("ulocale.getCountry " + loc2.getCountry());
        System.out.println("ulocale.getVariant " + loc2.getVariant());
        for (Iterator<String> it = loc2.getKeywords(); it.hasNext();) {
            String keyword = it.next();
            System.out.println("\tulocale.getKeywords " + keyword + " = " + loc2.getKeywordValue(keyword));
        }

        // every string generated from the BNF must be matched by the regex
        BNF bnf = bnfData.getBnf();
        for (int i = 0; i < 100; ++i) {
            String trial = bnf.next();
            trial = randomizeAsciiCase(trial);
            System.out.println(trial);
            if (!regexLanguageTag.reset(trial).matches()) {
                throw new IllegalArgumentException("Regex generation fails with: " + trial);
            }
        }

        // generate a bunch of ill-formed items. Try to favor ones that might actually cause problems.
        // TODO make all numeric and all alpha more common
        System.out.println("*** ILL-FORMED ***");
        BNF invalidBNF = new BNF(new Random(0), new Quoter.RuleQuoter())
            .setMaxRepeat(5)
            .addRules("$tag = ([A-Z a-z 0-9]{1,8} 50% 20% 10% 5% 5% 5% 5%);")
            .addRules("$s = [-_] ;")
            .addRules("$root = $tag ($s $tag){0,7} 10% 10% 10% 10% 10% 10% 10% 10% ; ")
            .complete();

        for (int i = 0; i < 100; ++i) {
            String trial = invalidBNF.next();
            if (regexLanguageTag.reset(trial).matches()) {
                continue; // happens to be well-formed; only the ill-formed ones are listed
            }
            System.out.println(trial);
        }

        System.out.println(contents);

        // System.out.println(langTagPattern);
        // System.out.println(cleanedLangTagPattern);
        // StandardCodes sc = StandardCodes.make();
        // Set<String> grandfathered = sc.getAvailableCodes("grandfathered");
        // for (Iterator it = grandfathered.iterator(); it.hasNext();) {
        // System.out.print(it.next() + " | ");
        // }
        // System.out.println();

        LanguageTagParser ltp = new LanguageTagParser();
        SimpleLocaleParser simpleLocaleParser = new SimpleLocaleParser();
        boolean expected = true; // tags are expected to be well-formed until an ILL-FORMED line is seen
        int errorCount = 0;
        BufferedReader in = FileUtilities.openUTF8Reader("", LANGUAGE_TAG_TEST_FILE);
        try {
            while (true) {
                String test = in.readLine();
                if (test == null) {
                    break;
                }

                // remove initial bom, comments
                if (test.length() == 0) {
                    continue;
                }
                if (test.charAt(0) == '\uFEFF') {
                    test = test.substring(1);
                }
                int hashPos = test.indexOf('#');
                if (hashPos >= 0) {
                    test = test.substring(0, hashPos);
                }
                test = test.trim(); // removing a comment can leave trailing whitespace, or nothing at all
                if (test.length() == 0) {
                    continue;
                }

                if (test.equalsIgnoreCase("WELL-FORMED")) {
                    expected = true;
                    continue;
                } else if (test.equalsIgnoreCase("ILL-FORMED")) {
                    expected = false;
                    continue;
                }
                System.out.println("Parsing " + test);
                checkParse(ltp, simpleLocaleParser, test);
                boolean matches = regexLanguageTag.reset(test).matches();
                if (matches != expected) {
                    System.out.println("*** TEST FAILURE ***");
                    ++errorCount;
                }

                System.out.println("\tregex?\t" + matches
                    + (matches == expected ? "" : "\t EXPECTED: " + expected + " for\t" + test));
                if (matches) {
                    for (int j = 0; j <= regexLanguageTag.groupCount(); ++j) {
                        String g = regexLanguageTag.group(j);
                        if (g == null || g.length() == 0) {
                            continue;
                        }
                        System.out.println("\t" + j + "\t" + groupName(j) + ":\t" + g);
                    }
                }
            }
        } finally {
            in.close();
        }
        System.out.println("Error count: " + errorCount);
    }

    /**
     * Parses the tag with both parsers, reports any field on which they disagree, and prints the fields
     * found by the LanguageTagParser. Any exception thrown while parsing is reported on standard output
     * as an invalid tag rather than propagated.
     *
     * @param ltp
     *            the full language tag parser
     * @param slp
     *            the simple locale parser to compare against
     * @param test
     *            the language tag to parse
     */
    private static void checkParse(LanguageTagParser ltp, SimpleLocaleParser slp, String test) {
        try {
            ltp.set(test);
            boolean couldParse = slp.set(test);
            if (!couldParse) {
                System.out.println("###Couldn't parse: " + test);
            } else {
                System.out.println("Simple Parser: " + slp.toString());
                String lang = ltp.getLanguage();
                if (lang.length() == 0) {
                    // the comparison uses "und" when the LanguageTagParser reports no language
                    lang = "und";
                }
                checkStrings("language", lang, slp.getLanguage());
                checkStrings("script", ltp.getScript(), slp.getScript());
                checkStrings("country", ltp.getRegion(), slp.getCountry());
                checkStrings("variants", ltp.getVariants(), slp.getVariants());
                // the simple parser reports a single extensions map, so merge the two kinds for comparison
                Map<String, String> allExtensions = new LinkedHashMap<String, String>();
                allExtensions.putAll(ltp.getExtensions());
                allExtensions.putAll(ltp.getLocaleExtensions());
                checkStrings("extensions", allExtensions, slp.getExtensions());
            }

            if (ltp.getLanguage().length() != 0) {
                System.out.println("\tlang: \t" + ltp.getLanguage()
                    + (ltp.isGrandfathered() ? " (grandfathered)" : ""));
            }
            if (ltp.getScript().length() != 0) {
                System.out.println("\tscript:\t" + ltp.getScript());
            }
            if (ltp.getRegion().length() != 0) {
                System.out.println("\tregion:\t" + ltp.getRegion());
            }
            if (ltp.getVariants().size() != 0) {
                System.out.println("\tvariants:\t" + ltp.getVariants());
            }
            if (ltp.getExtensions().size() != 0) {
                System.out.println("\textensions:\t" + ltp.getExtensions());
            }
            if (ltp.getLocaleExtensions().size() != 0) {
                System.out.println("\tlocale extensions:\t" + ltp.getLocaleExtensions());
            }
            System.out.println("\tisValid?\t" + ltp.isValid());
        } catch (Exception e) {
            // deliberate catch-all: a tag that cannot be parsed is a reportable result, not a failure
            // of the tool
            System.out.println("\t" + e.getMessage());
            System.out.println("\tisValid?\tfalse");
        }
    }

    /**
     * Compares the string forms of two values, treating '_' and '-' as equivalent, and prints a message
     * if they differ. A difference in case only is reported with a "$$$" prefix, any other difference
     * with "###". A null value is compared as the string "null".
     *
     * @param message
     *            name of the field being compared
     * @param obj1
     *            first value
     * @param obj2
     *            second value
     */
    private static <T> void checkStrings(String message, T obj1, T obj2) {
        String object1 = String.valueOf(obj1).replace('_', '-');
        String object2 = String.valueOf(obj2).replace('_', '-');
        if (!object1.equals(object2)) {
            if (object1.equalsIgnoreCase(object2)) {
                System.out.println("$$$Case Difference at " + message + "<" + obj1 + "> != <" + obj2 + ">");
            } else {
                System.out.println("###Difference at " + message + "<" + obj1 + "> != <" + obj2 + ">");
            }
        }
    }
}