package org.unicode.cldr.tool;

import java.io.File;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.tool.Option.Options;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.DraftStatus;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CoverageInfo;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.Organization;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.RegexFileParser;
import org.unicode.cldr.util.RegexFileParser.RegexLineParser;
import org.unicode.cldr.util.RegexLookup;
import org.unicode.cldr.util.StandardCodes;
import org.unicode.cldr.util.XMLSource;
import org.unicode.cldr.util.XPathParts;

import com.ibm.icu.util.Output;

/**
 * Factory for filtering CLDRFiles by organization and replacing certain values.
 * Organization coverage data is in org/unicode/cldr/util/data/Locales.txt.
 *
 * <p>Unresolved files produced by this factory are thawed clones of the files of the
 * wrapped factory, passed through three steps: value/xpath modification (only when
 * enabled), coverage filtering and removal of redundant paths (both only when an
 * organization is set).
 *
 * @author jchye
 */
public class FilterFactory extends Factory {
    /**
     * Types of data modification supported. The constant names are matched against the
     * first field of each line of the modifier file via {@code valueOf}, so they must
     * stay lower case.
     */
    private enum ModificationType {
        xpath, value
    }

    /** Matches a literal (non-regex) XPath. */
    private static final Pattern XPATH_PATTERN =
        PatternCache.get("/(/\\w++(\\[@\\w++=\"[^\"()%\\\\]+\"])*)++");

    private static final Options options = new Options(
        "Filters CLDR XML files according to organizational coverage levels and an " +
            "input file of replacement values/xpaths.")
        .add("org", 'o', ".*", Organization.cldr.name(),
            "The organization that the filtering is for. If set, also removes duplicate paths.")
        .add("locales", 'l', ".*", ".*", "A regular expression indicating the locales to be filtered");

    private final Factory rawFactory;
    private final String organization;
    private final boolean modifyValues;

    private final List<Modifier> modifiers = new ArrayList<>();

    /**
     * Creates a new Factory for filtering CLDRFiles.
     *
     * @param rawFactory
     *            the factory to be filtered
     * @param organization
     *            the organization that the filtering is catered towards
     * @param modifyValues
     *            true if certain values in the data should be modified or replaced
     */
    private FilterFactory(Factory rawFactory, String organization, boolean modifyValues) {
        this.rawFactory = rawFactory;
        this.organization = organization;
        setSupplementalDirectory(rawFactory.getSupplementalDirectory());
        this.modifyValues = modifyValues;
    }

    /**
     * Creates a FilterFactory and loads its modifiers from {@code dataModifiers.txt}.
     *
     * @param rawFactory
     *            the factory to be filtered
     * @param organization
     *            the organization that the filtering is catered towards; if null, coverage
     *            filtering and redundant-path removal are skipped
     * @param usesAltValue
     *            true if values/xpaths should be modified according to the modifier file
     * @return the new FilterFactory
     */
    public static FilterFactory load(Factory rawFactory, String organization, boolean usesAltValue) {
        FilterFactory filterFactory = new FilterFactory(rawFactory, organization, usesAltValue);
        filterFactory.loadModifiers("dataModifiers.txt");
        return filterFactory;
    }

    @Override
    public File[] getSourceDirectories() {
        return rawFactory.getSourceDirectories();
    }

    @Override
    public List<File> getSourceDirectoriesForLocale(String localeID) {
        return rawFactory.getSourceDirectoriesForLocale(localeID);
    }

    @Override
    protected CLDRFile handleMake(String localeID, boolean resolved, DraftStatus minimalDraftStatus) {
        if (resolved) {
            return new CLDRFile(makeResolvingSource(localeID, minimalDraftStatus));
        } else {
            return filterCldrFile(localeID, minimalDraftStatus);
        }
    }

    /**
     * Builds the filtered, unresolved file for a locale from a thawed clone of the raw file.
     *
     * @return a filtered CLDRFile.
     */
    private CLDRFile filterCldrFile(String localeID, DraftStatus minimalDraftStatus) {
        CLDRFile rawFile = rawFactory.make(localeID, false, minimalDraftStatus).cloneAsThawed();

        filterAltValues(rawFile);
        filterCoverage(rawFile);
        removeRedundantPaths(rawFile);
        return rawFile;
    }

    /**
     * Applies every loaded modifier whose entries match the file's locale.
     * Does nothing unless value modification was enabled at construction.
     *
     * @param rawFile
     *            the file to modify in place
     */
    private void filterAltValues(CLDRFile rawFile) {
        if (!modifyValues) return;

        String localeID = rawFile.getLocaleID();
        for (Modifier modifier : modifiers) {
            Modifier localeModifier = modifier.filterLocale(localeID);
            if (!localeModifier.isEmpty()) {
                localeModifier.modifyFile(rawFile);
            }
        }
    }

    /**
     * Filters a CLDRFile according to the specified organization's coverage level:
     * every path whose coverage value exceeds the organization's level for the
     * locale is removed. Does nothing if no organization is set.
     *
     * @param rawFile
     *            the file to filter in place
     */
    private void filterCoverage(CLDRFile rawFile) {
        if (organization == null) return;

        String localeID = rawFile.getLocaleID();
        int minLevel = StandardCodes.make()
            .getLocaleCoverageLevel(organization, localeID)
            .getLevel();
        CoverageInfo covInfo = CLDRConfig.getInstance().getCoverageInfo();
        // Collect first and remove afterwards so the file is never modified while it is
        // being iterated (same approach as removeRedundantPaths).
        List<String> pathsToRemove = new ArrayList<>();
        for (String xpath : rawFile) {
            // NOTE(review): the original comment here said "Locale metadata shouldn't be
            // stripped", but no explicit check exists; this relies entirely on the
            // coverage value of such paths. Confirm that is sufficient.
            int level = covInfo.getCoverageValue(xpath, localeID);
            if (level > minLevel) {
                pathsToRemove.add(xpath);
            }
        }
        for (String xpath : pathsToRemove) {
            rawFile.remove(xpath);
        }
    }

    /**
     * Removes paths with duplicate values that can be found elsewhere in the file
     * (the count="other" sibling) or in the resolved parent locale.
     * Does nothing if no organization is set or if the file is root.
     * Paths under //ldml/identity are never removed.
     *
     * @param rawFile
     *            the file to filter in place
     */
    private void removeRedundantPaths(CLDRFile rawFile) {
        if (organization == null || rawFile.getLocaleID().equals("root")) return;

        String parent = LocaleIDParser.getParent(rawFile.getLocaleID());
        CLDRFile resolvedParent = rawFactory.make(parent, true);
        List<String> duplicatePaths = new ArrayList<>();
        for (String xpath : rawFile) {
            if (xpath.startsWith("//ldml/identity")) {
                continue;
            }
            String value = rawFile.getStringValue(xpath);
            // Remove count="x" if the value is equivalent to count="other".
            if (xpath.contains("[@count=")) {
                XPathParts parts = XPathParts.getFrozenInstance(xpath);
                // The count attribute is only looked up on the last element; it is null
                // when "[@count=" occurs on an earlier element, so guard against that.
                String count = parts.getAttributeValue(-1, "count");
                if (count != null && !count.equals("other")) {
                    parts = parts.cloneAsThawed();
                    parts.setAttribute(-1, "count", "other");
                    String otherPath = parts.toString();
                    if (value.equals(rawFile.getStringValue(otherPath))) {
                        duplicatePaths.add(xpath);
                        continue;
                    }
                }
            }
            // Remove xpaths with values also found in the parent.
            String sourceLocale = resolvedParent.getSourceLocaleID(xpath, null);
            if (!XMLSource.CODE_FALLBACK_ID.equals(sourceLocale)) {
                String parentValue = resolvedParent.getStringValue(xpath);
                if (value.equals(parentValue)) {
                    duplicatePaths.add(xpath);
                }
            }
        }
        for (String xpath : duplicatePaths) {
            rawFile.remove(xpath);
        }
    }

    @Override
    public DraftStatus getMinimalDraftStatus() {
        return rawFactory.getMinimalDraftStatus();
    }

    @Override
    protected Set<String> handleGetAvailable() {
        return rawFactory.getAvailable();
    }

    /**
     * Wrapper class for holding information about a value modification entry.
     */
    private static class ModifierEntry {
        final String oldValue;
        final String newValue;
        final Map<String, String> options;

        public ModifierEntry(String oldValue, String newValue, Map<String, String> options) {
            this.oldValue = oldValue;
            this.newValue = newValue;
            this.options = options;
        }

        /**
         * @param locale
         *            the locale to be matched
         * @return true if the locale matches the locale filter in this entry, or if the
         *         entry has no "locale" option.
         */
        public boolean localeMatches(String locale) {
            String pattern = options.get("locale");
            return pattern == null || locale.matches(pattern);
        }
    }

    /**
     * Class for performing a specific type of data modification on a CLDRFile.
     */
    private abstract static class Modifier {
        protected List<ModifierEntry> entries = new ArrayList<>();

        /**
         * Applies all entries of this modifier to the file, in place.
         */
        public abstract void modifyFile(CLDRFile file);

        /**
         * @return a new modifier of the same kind holding only the entries that match
         *         the specified locale.
         */
        public abstract Modifier filterLocale(String locale);

        /**
         * @return the list of modifiers meant for the specified locale.
         */
        protected List<ModifierEntry> getModifiersForLocale(String locale) {
            List<ModifierEntry> newFilters = new ArrayList<>();
            for (ModifierEntry filter : entries) {
                if (filter.localeMatches(locale)) {
                    newFilters.add(filter);
                }
            }
            return newFilters;
        }

        /**
         * Adds an entry to this modifier.
         *
         * @param entry
         *            the entry to add
         */
        public void addModifierEntry(ModifierEntry entry) {
            entries.add(entry);
        }

        /**
         * @return true if this modifier has no entries.
         */
        public boolean isEmpty() {
            return entries.isEmpty();
        }
    }

    /**
     * Maps the value of an XPath onto another XPath.
     */
    private static class PathModifier extends Modifier {
        @Override
        public void modifyFile(CLDRFile file) {
            // For certain alternate values, use them as the main values.
            for (ModifierEntry entry : entries) {
                String oldPath = entry.oldValue;
                String value = file.getStringValue(oldPath);
                if (value != null) {
                    String newPath = entry.newValue;
                    file.add(newPath, value);
                    file.remove(oldPath);
                }
            }
        }

        @Override
        public Modifier filterLocale(String locale) {
            PathModifier newModifier = new PathModifier();
            newModifier.entries = getModifiersForLocale(locale);
            return newModifier;
        }
    }

    /**
     * Replaces certain values with other values. Each entry's old value is used as a
     * regex and its new value as the replacement ({@code String.replaceAll}).
     */
    private static class ValueModifier extends Modifier {
        @Override
        public void modifyFile(CLDRFile file) {
            // Replace values.
            for (ModifierEntry entry : entries) {
                String filteringPath = entry.options.get("xpath");
                if (filteringPath != null && isValidXPath(filteringPath)) {
                    // For non-regex XPaths, look them up directly.
                    String value = file.getStringValue(filteringPath);
                    if (value != null) {
                        value = value.replaceAll(entry.oldValue, entry.newValue);
                        file.add(filteringPath, value);
                    }
                } else {
                    // No xpath option: visit every path. Regex xpath: visit matching paths only.
                    Iterator<String> iterator = file.iterator();
                    if (filteringPath != null) {
                        Matcher matcher = PatternCache.get(filteringPath).matcher("");
                        iterator = file.iterator(matcher);
                    }
                    while (iterator.hasNext()) {
                        String xpath = iterator.next();
                        String originalValue = file.getStringValue(xpath);
                        String value = originalValue.replaceAll(entry.oldValue, entry.newValue);
                        if (!value.equals(originalValue)) {
                            file.add(xpath, value);
                        }
                    }
                }
            }
        }

        @Override
        public Modifier filterLocale(String locale) {
            ValueModifier newModifier = new ValueModifier();
            newModifier.entries = getModifiersForLocale(locale);
            return newModifier;
        }
    }

    /**
     * Maps the value of XPaths onto other XPaths using regexes.
     */
    private static class PathRegexModifier extends Modifier {
        private RegexLookup<String> xpathLookup = new RegexLookup<>();

        @Override
        public void addModifierEntry(ModifierEntry entry) {
            super.addModifierEntry(entry);
            xpathLookup.add(entry.oldValue, entry.newValue);
        }

        @Override
        public void modifyFile(CLDRFile file) {
            if (xpathLookup.size() > 0) {
                Output<String[]> arguments = new Output<>();
                // NOTE(review): paths are added to and removed from the file while it is
                // being iterated. This is kept as-is because deferring the changes could
                // alter which paths get visited; confirm the file's iterator tolerates it.
                for (String xpath : file) {
                    String newValue = xpathLookup.get(xpath, null, arguments, null, null);
                    if (newValue != null) {
                        String newPath = RegexLookup.replace(newValue, arguments.value);
                        String value = file.getStringValue(xpath);
                        file.add(newPath, value);
                        file.remove(xpath);
                    }
                }
            }
        }

        @Override
        public Modifier filterLocale(String locale) {
            PathRegexModifier newModifier = new PathRegexModifier();
            newModifier.entries = getModifiersForLocale(locale);
            // The lookup has to be rebuilt because entries were assigned directly
            // rather than through addModifierEntry.
            for (ModifierEntry entry : newModifier.entries) {
                newModifier.xpathLookup.add(entry.oldValue, entry.newValue);
            }
            return newModifier;
        }
    }

    /**
     * Loads modifiers from a specified file. Does nothing unless value modification
     * was enabled at construction.
     *
     * <p>Each parsed line has the form
     * {@code type ; oldValue ; newValue [; option=value]*}, where type is one of the
     * {@link ModificationType} names.
     *
     * @param filename
     *            name of the modifier file, resolved relative to this class
     * @throws IllegalArgumentException
     *             (from the line parser) if a line has fewer than three fields, an unknown
     *             type, or an option without '='
     */
    private void loadModifiers(String filename) {
        if (!modifyValues) return;
        final Modifier pathModifier = new PathModifier();
        final Modifier pathRegexModifier = new PathRegexModifier();
        final Modifier valueModifier = new ValueModifier();
        RegexFileParser fileParser = new RegexFileParser();
        fileParser.setLineParser(new RegexLineParser() {
            @Override
            public void parse(String line) {
                String[] contents = line.split("\\s*+;\\s*+");
                if (contents.length < 3) {
                    throw new IllegalArgumentException("Invalid modifier line: " + line);
                }
                ModificationType filterType = ModificationType.valueOf(contents[0]);
                String oldValue = contents[1];
                String newValue = contents[2];
                // Process remaining options.
                Map<String, String> options = new HashMap<>();
                for (int i = 3; i < contents.length; i++) {
                    String rawLine = contents[i];
                    int pos = rawLine.indexOf('=');
                    if (pos < 0) {
                        throw new IllegalArgumentException("Invalid option: " + rawLine);
                    }
                    String optionType = rawLine.substring(0, pos).trim();
                    options.put(optionType, rawLine.substring(pos + 1).trim());
                }

                switch (filterType) {
                case xpath:
                    // Literal xpaths are mapped directly; anything else is treated as a regex.
                    if (isValidXPath(oldValue)) {
                        pathModifier.addModifierEntry(new ModifierEntry(oldValue, newValue, options));
                    } else {
                        pathRegexModifier.addModifierEntry(new ModifierEntry(fixXPathRegex(oldValue),
                            newValue, options));
                    }
                    break;
                case value:
                    String xpath = options.get("xpath");
                    if (xpath != null && !isValidXPath(xpath)) {
                        options.put("xpath", fixXPathRegex(xpath));
                    }
                    valueModifier.addModifierEntry(new ModifierEntry(oldValue, newValue, options));
                    break;
                }
            }
        });
        fileParser.parse(FilterFactory.class, filename);
        modifiers.add(pathModifier);
        modifiers.add(pathRegexModifier);
        modifiers.add(valueModifier);
    }

    /**
     * @param path
     *            the string to test
     * @return true if path is a valid XPath and not a regex.
     */
    private static boolean isValidXPath(String path) {
        return XPATH_PATTERN.matcher(path).matches();
    }

    /**
     * Converts an xpath into a proper regex pattern by anchoring it at the start and
     * escaping the '[' of every "[@" attribute opener.
     *
     * @param path
     *            the xpath-like regex from the modifier file
     * @return the regex pattern string
     */
    private static String fixXPathRegex(String path) {
        return '^' + path.replace("[@", "\\[@");
    }

    /**
     * Run FilterFactory for a specific organization. Writes one filtered, unresolved
     * XML file per available locale into the "filter" subdirectory of the CLDR
     * generation directory.
     *
     * @param args
     *            command-line arguments; see the "org" and "locales" options
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        options.parse(args, true);
        Factory rawFactory = Factory.make(CLDRPaths.MAIN_DIRECTORY, options.get("locales").getValue());
        String org = options.get("org").getValue();
        FilterFactory filterFactory = FilterFactory.load(rawFactory, org, true);
        String outputDir = CLDRPaths.GEN_DIRECTORY + "/filter";
        for (String locale : rawFactory.getAvailable()) {
            try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, locale + ".xml")) {
                filterFactory.make(locale, false).write(out);
            }
        }
    }
}