1 package org.unicode.cldr.tool; 2 3 import com.google.common.base.Joiner; 4 import com.google.common.base.Splitter; 5 import com.google.common.collect.ImmutableSortedSet; 6 import com.ibm.icu.impl.Utility; 7 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap; 8 import com.ibm.icu.text.UnicodeSet; 9 import java.io.IOException; 10 import java.util.HashSet; 11 import java.util.LinkedHashMap; 12 import java.util.List; 13 import java.util.Map; 14 import java.util.Set; 15 import java.util.TreeMap; 16 import java.util.TreeSet; 17 import java.util.regex.Matcher; 18 import java.util.regex.Pattern; 19 import org.unicode.cldr.draft.FileUtilities; 20 import org.unicode.cldr.test.DisplayAndInputProcessor; 21 import org.unicode.cldr.tool.Option.Options; 22 import org.unicode.cldr.tool.Option.Params; 23 import org.unicode.cldr.util.Annotations; 24 import org.unicode.cldr.util.Annotations.AnnotationSet; 25 import org.unicode.cldr.util.CLDRConfig; 26 import org.unicode.cldr.util.CLDRFile; 27 import org.unicode.cldr.util.CLDRPaths; 28 import org.unicode.cldr.util.CLDRTreeWriter; 29 import org.unicode.cldr.util.CldrUtility; 30 import org.unicode.cldr.util.Emoji; 31 import org.unicode.cldr.util.Factory; 32 import org.unicode.cldr.util.Level; 33 import org.unicode.cldr.util.LocaleNames; 34 import org.unicode.cldr.util.Organization; 35 import org.unicode.cldr.util.SimpleXMLSource; 36 import org.unicode.cldr.util.StandardCodes; 37 import org.unicode.cldr.util.XPathParts.Comments.CommentType; 38 39 public class GenerateDerivedAnnotations { 40 // The guts for derived names is in Annotations.synthesize 41 // Run CLDRModify afterwards: see 42 // https://docs.google.com/document/d/1h4xeKyEwCFnFvfN_szExBJcHaHQ8-tgoJTNMPQEpg8Q/edit for 43 // example 44 // Then run 45 // https://sites.google.com/site/cldr/internal-development/cldr-development-site/generate-algorithmic-locales 46 47 private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); 48 49 static final UnicodeSet SKIP = 50 new UnicodeSet() 51 .add(Annotations.ENGLISH_MARKER) 52 .add(Annotations.BAD_MARKER) 53 .add(Annotations.MISSING_MARKER) 54 .freeze(); 55 56 static Map<String, String> codepointToIsoCurrencyCode; 57 58 static { 59 final Splitter tabSplitter = Splitter.on('\t').trimResults(); 60 Map<String, String> _codepointToIsoCurrencyCode = new TreeMap<>(); 61 for (String line : 62 FileUtilities.in(CldrUtility.class, "data/codepointToIsoCurrencyCode.tsv")) { 63 if (line.startsWith("#")) { 64 continue; 65 } 66 List<String> parts = tabSplitter.splitToList(line); 67 _codepointToIsoCurrencyCode.put(parts.get(0), parts.get(1)); 68 } 69 codepointToIsoCurrencyCode = ImmutableMap.copyOf(_codepointToIsoCurrencyCode); 70 } 71 72 private enum MyOptions { 73 fileFilter( 74 new Params() 75 .setHelp("filter files by dir/locale, eg: ^main/en$ or .*/en") 76 .setMatch(".*") 77 .setDefault(".*")), 78 missing(new Params().setHelp("only missing").setMatch("")), 79 ; 80 81 // BOILERPLATE TO COPY 82 final Option option; 83 MyOptions(Params params)84 private MyOptions(Params params) { 85 option = new Option(this, params); 86 } 87 88 private static Options myOptions = new Options(); 89 90 static { 91 for (MyOptions option : MyOptions.values()) { myOptions.add(option, option.option)92 myOptions.add(option, option.option); 93 } 94 } 95 parse(String[] args)96 private static Set<String> parse(String[] args) { 97 return myOptions.parse(MyOptions.values()[0], args, true); 98 } 99 } 100 main(String[] args)101 public static void main(String[] args) throws IOException { 102 MyOptions.parse(args); 103 104 boolean missingOnly = MyOptions.missing.option.doesOccur(); 105 if (missingOnly) { 106 System.out.println( 107 "With the 'missing' argument files will not be written, only the missing items will be written to the console"); 108 } 109 110 Matcher localeMatcher = Pattern.compile(MyOptions.fileFilter.option.getValue()).matcher(""); 111 Joiner BAR = Joiner.on(" | "); 112 AnnotationSet enAnnotations = Annotations.getDataSet("en"); 113 CLDRFile english = CLDR_CONFIG.getEnglish(); 114 115 UnicodeSet derivables = 116 new UnicodeSet(Emoji.getAllRgiNoES()) 117 .addAll(codepointToIsoCurrencyCode.keySet()) 118 .removeAll(enAnnotations.keySet()) 119 .freeze(); 120 121 for (String d : derivables) { 122 if (d.contains("")) { 123 System.out.println(d + "\t" + Utility.hex(d)); 124 } 125 } 126 127 Map<String, UnicodeSet> localeToFailures = new LinkedHashMap<>(); 128 Set<String> locales = ImmutableSortedSet.copyOf(Annotations.getAvailable()); 129 final Factory cldrFactory = CLDRConfig.getInstance().getCldrFactory(); 130 final Map<String, Integer> failureMap = new TreeMap<>(); 131 int processCount = 0; 132 133 CLDRTreeWriter treeWriter = 134 new CLDRTreeWriter(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived"); 135 136 for (String locale : locales) { 137 if (LocaleNames.ROOT.equals(locale)) { 138 continue; 139 } 140 if (!localeMatcher.reset(locale).matches()) { 141 continue; 142 } 143 processCount++; 144 UnicodeSet failures = new UnicodeSet(Emoji.getAllRgiNoES()); 145 localeToFailures.put(locale, failures); 146 147 AnnotationSet annotations; 148 try { 149 annotations = Annotations.getDataSet(locale); 150 failures.removeAll(annotations.getExplicitValues()); 151 } catch (Exception e) { 152 System.out.println( 153 "Can't create annotations for: " + locale + "\n\t" + e.getMessage()); 154 annotations = Annotations.getDataSet(locale); 155 continue; 156 } 157 CLDRFile target = new CLDRFile(new SimpleXMLSource(locale)); 158 CLDRFile main = null; 159 DisplayAndInputProcessor DAIP = new DisplayAndInputProcessor(target); 160 Exception[] internalException = new Exception[1]; 161 162 target.addComment( 163 "//ldml", 164 "Derived short names and annotations, using GenerateDerivedAnnotations.java. See warnings in /annotations/ file.", 165 CommentType.PREBLOCK); 166 for (String derivable : derivables) { 167 String shortName = null; 168 try { 169 shortName = annotations.getShortName(derivable); 170 } catch (Exception e) { 171 } 172 173 if (shortName == null) { 174 String currencyCode = codepointToIsoCurrencyCode.get(derivable); 175 if (currencyCode != null) { 176 if (main == null) { 177 main = cldrFactory.make(locale, true); 178 } 179 shortName = main.getName(CLDRFile.CURRENCY_NAME, currencyCode); 180 if (shortName.contentEquals(currencyCode)) { 181 shortName = null; // don't want fallback raw code 182 } 183 } 184 } 185 186 if (shortName == null || SKIP.containsSome(shortName)) { 187 continue; // missing 188 } 189 Set<String> keywords = annotations.getKeywordsMinus(derivable); 190 String path = "//ldml/annotations/annotation[@cp=\"" + derivable + "\"]"; 191 if (!keywords.isEmpty()) { 192 Set<String> keywordsFixed = new HashSet<>(); 193 for (String keyword : keywords) { 194 if (!SKIP.containsSome(keyword)) { 195 keywordsFixed.add(keyword); 196 } 197 } 198 if (!keywordsFixed.isEmpty()) { 199 String value = BAR.join(keywordsFixed); 200 String newValue = DAIP.processInput(path, value, internalException); 201 target.add(path, newValue); 202 } 203 } 204 failures.remove(derivable); 205 String ttsPath = path + "[@type=\"tts\"]"; 206 String shortName2 = DAIP.processInput(path, shortName, internalException); 207 target.add(ttsPath, shortName2); 208 } 209 failures.freeze(); 210 if (!failures.isEmpty()) { 211 Level level = 212 StandardCodes.make().getLocaleCoverageLevel(Organization.cldr, locale); 213 System.out.println( 214 "Failures\t" 215 + locale 216 + "\t" 217 + level 218 + "\t" 219 + english.getName(locale) 220 + "\t" 221 + failures.size() 222 + "\t" 223 + failures.toPattern(false)); 224 failureMap.put(locale, failures.size()); 225 } 226 if (missingOnly) { 227 continue; 228 } 229 230 treeWriter.write(target); 231 } 232 Factory factory = Factory.make(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", ".*"); 233 for (String locale : locales) { 234 if (LocaleNames.ROOT.equals(locale)) { 235 continue; 236 } 237 if (!localeMatcher.reset(locale).matches()) { 238 continue; 239 } 240 CLDRFile cldrFileUnresolved = factory.make(locale, false); 241 CLDRFile cldrFileResolved = factory.make(locale, true); 242 Set<String> toRemove = new TreeSet<>(); // TreeSet just makes debugging easier 243 boolean gotOne = false; 244 for (String xpath : cldrFileUnresolved) { 245 if (xpath.startsWith("//ldml/identity")) { 246 continue; 247 } 248 249 String value = cldrFileUnresolved.getStringValue(xpath); 250 251 // remove items that are the same as their bailey values. This also catches 252 // Inheritance Marker 253 254 String bailey = cldrFileResolved.getBaileyValue(xpath, null, null); 255 if (value.equals(bailey)) { 256 toRemove.add(xpath); 257 continue; 258 } 259 gotOne = true; 260 } 261 if (!gotOne) { 262 if (locale.equals("sr_Cyrl")) { 263 System.err.println("TODO: keep from deleting files with non-empty children"); 264 } else { 265 System.out.println("Removing empty " + locale); 266 treeWriter.delete(locale); 267 } 268 } else if (!toRemove.isEmpty()) { 269 System.out.println("Removing " + toRemove.size() + " items from " + locale); 270 CLDRFile fileToWrite = cldrFileUnresolved.cloneAsThawed(); 271 fileToWrite.removeAll(toRemove, false); 272 treeWriter.write(fileToWrite); 273 } 274 } 275 treeWriter.close(); 276 System.out.println( 277 "Be sure to run CLDRModify passes afterwards, and generate transformed locales (like de-CH)."); 278 if (!failureMap.isEmpty()) { 279 failureMap 280 .entrySet() 281 .forEach( 282 e -> 283 System.err.printf( 284 "ERROR: %s: %d errors\n", e.getKey(), e.getValue())); 285 System.err.printf("ERROR: Errors in %d/%d locales.\n", failureMap.size(), processCount); 286 System.exit(1); 287 } else if (processCount == 0) { 288 System.err.println("ERROR: No locales matched. Check the -f option.\n"); 289 System.exit(1); 290 } else { 291 System.out.printf("OK: %d locales processed without error\n", processCount); 292 System.exit(0); 293 } 294 } 295 } 296