1 package org.unicode.cldr.tool; 2 3 import com.google.common.base.Joiner; 4 import com.google.common.base.Splitter; 5 import com.google.common.collect.ImmutableSortedSet; 6 import com.ibm.icu.impl.Utility; 7 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap; 8 import com.ibm.icu.text.UnicodeSet; 9 import java.io.File; 10 import java.io.IOException; 11 import java.io.PrintWriter; 12 import java.util.HashSet; 13 import java.util.LinkedHashMap; 14 import java.util.List; 15 import java.util.Map; 16 import java.util.Set; 17 import java.util.TreeMap; 18 import java.util.TreeSet; 19 import java.util.regex.Matcher; 20 import java.util.regex.Pattern; 21 import org.unicode.cldr.draft.FileUtilities; 22 import org.unicode.cldr.test.DisplayAndInputProcessor; 23 import org.unicode.cldr.tool.Option.Options; 24 import org.unicode.cldr.tool.Option.Params; 25 import org.unicode.cldr.util.Annotations; 26 import org.unicode.cldr.util.Annotations.AnnotationSet; 27 import org.unicode.cldr.util.CLDRConfig; 28 import org.unicode.cldr.util.CLDRFile; 29 import org.unicode.cldr.util.CLDRPaths; 30 import org.unicode.cldr.util.CldrUtility; 31 import org.unicode.cldr.util.Emoji; 32 import org.unicode.cldr.util.Factory; 33 import org.unicode.cldr.util.Level; 34 import org.unicode.cldr.util.LocaleNames; 35 import org.unicode.cldr.util.Organization; 36 import org.unicode.cldr.util.SimpleXMLSource; 37 import org.unicode.cldr.util.StandardCodes; 38 import org.unicode.cldr.util.XPathParts.Comments.CommentType; 39 40 public class GenerateDerivedAnnotations { 41 // The guts for derived names is in Annotations.synthesize 42 // Run CLDRModify afterwards: see 43 // https://docs.google.com/document/d/1h4xeKyEwCFnFvfN_szExBJcHaHQ8-tgoJTNMPQEpg8Q/edit for 44 // example 45 // Then run 46 // https://sites.google.com/site/cldr/internal-development/cldr-development-site/generate-algorithmic-locales 47 48 private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); 49 50 static final UnicodeSet SKIP = 51 new UnicodeSet() 52 .add(Annotations.ENGLISH_MARKER) 53 .add(Annotations.BAD_MARKER) 54 .add(Annotations.MISSING_MARKER) 55 .freeze(); 56 57 static Map<String, String> codepointToIsoCurrencyCode; 58 59 static { 60 final Splitter tabSplitter = Splitter.on('\t').trimResults(); 61 Map<String, String> _codepointToIsoCurrencyCode = new TreeMap<>(); 62 for (String line : 63 FileUtilities.in(CldrUtility.class, "data/codepointToIsoCurrencyCode.tsv")) { 64 if (line.startsWith("#")) { 65 continue; 66 } 67 List<String> parts = tabSplitter.splitToList(line); 68 _codepointToIsoCurrencyCode.put(parts.get(0), parts.get(1)); 69 } 70 codepointToIsoCurrencyCode = ImmutableMap.copyOf(_codepointToIsoCurrencyCode); 71 } 72 73 private enum MyOptions { 74 fileFilter( 75 new Params() 76 .setHelp("filter files by dir/locale, eg: ^main/en$ or .*/en") 77 .setMatch(".*") 78 .setDefault(".*")), 79 missing(new Params().setHelp("only missing").setMatch("")), 80 ; 81 82 // BOILERPLATE TO COPY 83 final Option option; 84 MyOptions(Params params)85 private MyOptions(Params params) { 86 option = new Option(this, params); 87 } 88 89 private static Options myOptions = new Options(); 90 91 static { 92 for (MyOptions option : MyOptions.values()) { myOptions.add(option, option.option)93 myOptions.add(option, option.option); 94 } 95 } 96 parse(String[] args)97 private static Set<String> parse(String[] args) { 98 return myOptions.parse(MyOptions.values()[0], args, true); 99 } 100 } 101 main(String[] args)102 public static void main(String[] args) throws IOException { 103 MyOptions.parse(args); 104 105 boolean missingOnly = MyOptions.missing.option.doesOccur(); 106 if (missingOnly) { 107 System.out.println( 108 "With the 'missing' argument files will not be written, only the missing items will be written to the console"); 109 } 110 111 Matcher localeMatcher = Pattern.compile(MyOptions.fileFilter.option.getValue()).matcher(""); 112 Joiner BAR = Joiner.on(" | "); 113 AnnotationSet enAnnotations = Annotations.getDataSet("en"); 114 CLDRFile english = CLDR_CONFIG.getEnglish(); 115 116 UnicodeSet derivables = 117 new UnicodeSet(Emoji.getAllRgiNoES()) 118 .addAll(codepointToIsoCurrencyCode.keySet()) 119 .removeAll(enAnnotations.keySet()) 120 .freeze(); 121 122 for (String d : derivables) { 123 if (d.contains("")) { 124 System.out.println(d + "\t" + Utility.hex(d)); 125 } 126 } 127 128 Map<String, UnicodeSet> localeToFailures = new LinkedHashMap<>(); 129 Set<String> locales = ImmutableSortedSet.copyOf(Annotations.getAvailable()); 130 final Factory cldrFactory = CLDRConfig.getInstance().getCldrFactory(); 131 final Map<String, Integer> failureMap = new TreeMap<>(); 132 int processCount = 0; 133 134 for (String locale : locales) { 135 if (LocaleNames.ROOT.equals(locale)) { 136 continue; 137 } 138 if (!localeMatcher.reset(locale).matches()) { 139 continue; 140 } 141 processCount++; 142 UnicodeSet failures = new UnicodeSet(Emoji.getAllRgiNoES()); 143 localeToFailures.put(locale, failures); 144 145 AnnotationSet annotations; 146 try { 147 annotations = Annotations.getDataSet(locale); 148 failures.removeAll(annotations.getExplicitValues()); 149 } catch (Exception e) { 150 System.out.println( 151 "Can't create annotations for: " + locale + "\n\t" + e.getMessage()); 152 annotations = Annotations.getDataSet(locale); 153 continue; 154 } 155 CLDRFile target = new CLDRFile(new SimpleXMLSource(locale)); 156 CLDRFile main = null; 157 DisplayAndInputProcessor DAIP = new DisplayAndInputProcessor(target); 158 Exception[] internalException = new Exception[1]; 159 160 target.addComment( 161 "//ldml", 162 "Derived short names and annotations, using GenerateDerivedAnnotations.java. See warnings in /annotations/ file.", 163 CommentType.PREBLOCK); 164 for (String derivable : derivables) { 165 String shortName = null; 166 try { 167 shortName = annotations.getShortName(derivable); 168 } catch (Exception e) { 169 } 170 171 if (shortName == null) { 172 String currencyCode = codepointToIsoCurrencyCode.get(derivable); 173 if (currencyCode != null) { 174 if (main == null) { 175 main = cldrFactory.make(locale, true); 176 } 177 shortName = main.getName(CLDRFile.CURRENCY_NAME, currencyCode); 178 if (shortName.contentEquals(currencyCode)) { 179 shortName = null; // don't want fallback raw code 180 } 181 } 182 } 183 184 if (shortName == null || SKIP.containsSome(shortName)) { 185 continue; // missing 186 } 187 Set<String> keywords = annotations.getKeywordsMinus(derivable); 188 String path = "//ldml/annotations/annotation[@cp=\"" + derivable + "\"]"; 189 if (!keywords.isEmpty()) { 190 Set<String> keywordsFixed = new HashSet<>(); 191 for (String keyword : keywords) { 192 if (!SKIP.containsSome(keyword)) { 193 keywordsFixed.add(keyword); 194 } 195 } 196 if (!keywordsFixed.isEmpty()) { 197 String value = BAR.join(keywordsFixed); 198 String newValue = DAIP.processInput(path, value, internalException); 199 target.add(path, newValue); 200 } 201 } 202 failures.remove(derivable); 203 String ttsPath = path + "[@type=\"tts\"]"; 204 String shortName2 = DAIP.processInput(path, shortName, internalException); 205 target.add(ttsPath, shortName2); 206 } 207 failures.freeze(); 208 if (!failures.isEmpty()) { 209 Level level = 210 StandardCodes.make().getLocaleCoverageLevel(Organization.cldr, locale); 211 System.out.println( 212 "Failures\t" 213 + locale 214 + "\t" 215 + level 216 + "\t" 217 + english.getName(locale) 218 + "\t" 219 + failures.size() 220 + "\t" 221 + failures.toPattern(false)); 222 failureMap.put(locale, failures.size()); 223 } 224 if (missingOnly) { 225 continue; 226 } 227 try (PrintWriter pw = 228 FileUtilities.openUTF8Writer( 229 CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", locale + ".xml")) { 230 target.write(pw); 231 } 232 } 233 Factory factory = Factory.make(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", ".*"); 234 for (String locale : locales) { 235 if (LocaleNames.ROOT.equals(locale)) { 236 continue; 237 } 238 if (!localeMatcher.reset(locale).matches()) { 239 continue; 240 } 241 CLDRFile cldrFileUnresolved = factory.make(locale, false); 242 CLDRFile cldrFileResolved = factory.make(locale, true); 243 Set<String> toRemove = new TreeSet<>(); // TreeSet just makes debugging easier 244 boolean gotOne = false; 245 for (String xpath : cldrFileUnresolved) { 246 if (xpath.startsWith("//ldml/identity")) { 247 continue; 248 } 249 250 String value = cldrFileUnresolved.getStringValue(xpath); 251 252 // remove items that are the same as their bailey values. This also catches 253 // Inheritance Marker 254 255 String bailey = cldrFileResolved.getBaileyValue(xpath, null, null); 256 if (value.equals(bailey)) { 257 toRemove.add(xpath); 258 continue; 259 } 260 gotOne = true; 261 } 262 if (!gotOne) { 263 if (locale.equals("sr_Cyrl")) { 264 System.err.println("TODO: keep from deleting files with non-empty children"); 265 } else { 266 System.out.println("Removing empty " + locale); 267 new File(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", locale + ".xml") 268 .deleteOnExit(); 269 } 270 } else if (!toRemove.isEmpty()) { 271 System.out.println("Removing " + toRemove.size() + " items from " + locale); 272 CLDRFile fileToWrite = cldrFileUnresolved.cloneAsThawed(); 273 fileToWrite.removeAll(toRemove, false); 274 File file = 275 new File( 276 CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", locale + ".xml"); 277 try (PrintWriter pw = new PrintWriter(file)) { 278 fileToWrite.write(pw); 279 } 280 } 281 } 282 System.out.println( 283 "Be sure to run CLDRModify passes afterwards, and generate transformed locales (like de-CH)."); 284 if (!failureMap.isEmpty()) { 285 failureMap 286 .entrySet() 287 .forEach( 288 e -> 289 System.err.printf( 290 "ERROR: %s: %d errors\n", e.getKey(), e.getValue())); 291 System.err.printf("ERROR: Errors in %d/%d locales.\n", failureMap.size(), processCount); 292 System.exit(1); 293 } else if (processCount == 0) { 294 System.err.println("ERROR: No locales matched. Check the -f option.\n"); 295 System.exit(1); 296 } else { 297 System.out.printf("OK: %d locales processed without error\n", processCount); 298 System.exit(0); 299 } 300 } 301 } 302