1 package org.unicode.cldr.tool; 2 3 import java.io.File; 4 import java.io.IOException; 5 import java.io.PrintWriter; 6 import java.util.HashSet; 7 import java.util.LinkedHashMap; 8 import java.util.List; 9 import java.util.Map; 10 import java.util.Set; 11 import java.util.TreeMap; 12 import java.util.TreeSet; 13 import java.util.regex.Matcher; 14 import java.util.regex.Pattern; 15 16 import org.unicode.cldr.draft.FileUtilities; 17 import org.unicode.cldr.test.DisplayAndInputProcessor; 18 import org.unicode.cldr.tool.Option.Options; 19 import org.unicode.cldr.tool.Option.Params; 20 import org.unicode.cldr.util.Annotations; 21 import org.unicode.cldr.util.Annotations.AnnotationSet; 22 import org.unicode.cldr.util.CLDRConfig; 23 import org.unicode.cldr.util.CLDRFile; 24 import org.unicode.cldr.util.CLDRPaths; 25 import org.unicode.cldr.util.CldrUtility; 26 import org.unicode.cldr.util.Emoji; 27 import org.unicode.cldr.util.Factory; 28 import org.unicode.cldr.util.Level; 29 import org.unicode.cldr.util.Organization; 30 import org.unicode.cldr.util.SimpleXMLSource; 31 import org.unicode.cldr.util.StandardCodes; 32 import org.unicode.cldr.util.XPathParts.Comments.CommentType; 33 34 import com.google.common.base.Joiner; 35 import com.google.common.base.Splitter; 36 import com.google.common.collect.ImmutableSortedSet; 37 import com.ibm.icu.impl.Utility; 38 import com.ibm.icu.impl.locale.XCldrStub.ImmutableMap; 39 import com.ibm.icu.text.UnicodeSet; 40 41 public class GenerateDerivedAnnotations { 42 // Use EmojiData.getDerivableNames() to update this for each version of Unicode. 43 44 private static final CLDRConfig CLDR_CONFIG = CLDRConfig.getInstance(); 45 46 static final UnicodeSet SKIP = new UnicodeSet() 47 .add(Annotations.ENGLISH_MARKER) 48 .add(Annotations.BAD_MARKER) 49 .add(Annotations.MISSING_MARKER) 50 .freeze(); 51 52 static Map<String,String> codepointToIsoCurrencyCode; 53 static { 54 final Splitter tabSplitter = Splitter.on('\t').trimResults(); 55 Map<String,String> _codepointToIsoCurrencyCode = new TreeMap<>(); 56 for (String line : FileUtilities.in(CldrUtility.class, "data/codepointToIsoCurrencyCode.tsv")) { 57 if (line.startsWith("#")) { 58 continue; 59 } 60 List<String> parts = tabSplitter.splitToList(line); 61 _codepointToIsoCurrencyCode.put(parts.get(0), parts.get(1)); 62 } 63 codepointToIsoCurrencyCode = ImmutableMap.copyOf(_codepointToIsoCurrencyCode); 64 } 65 66 private enum MyOptions { 67 fileFilter(new Params().setHelp("filter files by dir/locale, eg: ^main/en$ or .*/en").setMatch(".*").setDefault(".*")), 68 missing(new Params().setHelp("only missing").setMatch("")), 69 ; 70 71 // BOILERPLATE TO COPY 72 final Option option; 73 MyOptions(Params params)74 private MyOptions(Params params) { 75 option = new Option(this, params); 76 } 77 78 private static Options myOptions = new Options(); 79 static { 80 for (MyOptions option : MyOptions.values()) { myOptions.add(option, option.option)81 myOptions.add(option, option.option); 82 } 83 } 84 parse(String[] args)85 private static Set<String> parse(String[] args) { 86 return myOptions.parse(MyOptions.values()[0], args, true); 87 } 88 } 89 main(String[] args)90 public static void main(String[] args) throws IOException { 91 MyOptions.parse(args); 92 93 boolean missingOnly = MyOptions.missing.option.doesOccur(); 94 if (missingOnly) { 95 System.out.println("With the 'missing' argument files will not be written, only the missing items will be written to the console"); 96 } 97 98 Matcher localeMatcher = Pattern.compile(MyOptions.fileFilter.option.getValue()).matcher(""); 99 Joiner BAR = Joiner.on(" | "); 100 AnnotationSet enAnnotations = Annotations.getDataSet("en"); 101 CLDRFile english = CLDR_CONFIG.getEnglish(); 102 103 UnicodeSet derivables = new UnicodeSet(Emoji.getAllRgiNoES()) 104 .addAll(codepointToIsoCurrencyCode.keySet()) 105 .removeAll(enAnnotations.keySet()) 106 .freeze(); 107 108 for (String d : derivables) { 109 if (d.contains("")) { 110 System.out.println(d + "\t" + Utility.hex(d)); 111 } 112 } 113 114 Map<String, UnicodeSet> localeToFailures = new LinkedHashMap<>(); 115 Set<String> locales = ImmutableSortedSet.copyOf(Annotations.getAvailable()); 116 final Factory cldrFactory = CLDRConfig.getInstance().getCldrFactory(); 117 final Map<String, Integer> failureMap = new TreeMap<>(); 118 int processCount = 0; 119 120 for (String locale : locales) { 121 if ("root".equals(locale)) { 122 continue; 123 } 124 if (!localeMatcher.reset(locale).matches()) { 125 continue; 126 } 127 processCount++; 128 UnicodeSet failures = new UnicodeSet(Emoji.getAllRgiNoES()); 129 localeToFailures.put(locale, failures); 130 131 AnnotationSet annotations; 132 try { 133 annotations = Annotations.getDataSet(locale); 134 failures.removeAll(annotations.getExplicitValues()); 135 } catch (Exception e) { 136 System.out.println("Can't create annotations for: " + locale + "\n\t" + e.getMessage()); 137 annotations = Annotations.getDataSet(locale); 138 continue; 139 } 140 CLDRFile target = new CLDRFile(new SimpleXMLSource(locale)); 141 CLDRFile main = null; 142 DisplayAndInputProcessor DAIP = new DisplayAndInputProcessor(target); 143 Exception[] internalException = new Exception[1]; 144 145 target.addComment("//ldml", "Derived short names and annotations, using GenerateDerivedAnnotations.java. See warnings in /annotations/ file.", 146 CommentType.PREBLOCK); 147 for (String derivable : derivables) { 148 String shortName = null; 149 try { 150 shortName = annotations.getShortName(derivable); 151 } catch (Exception e) { 152 } 153 154 if (shortName == null) { 155 String currencyCode = codepointToIsoCurrencyCode.get(derivable); 156 if (currencyCode != null) { 157 if (main == null) { 158 main = cldrFactory.make(locale, true); 159 } 160 shortName = main.getName(CLDRFile.CURRENCY_NAME, currencyCode); 161 if (shortName.contentEquals(currencyCode)) { 162 shortName = null; // don't want fallback raw code 163 } 164 } 165 } 166 167 if (shortName == null || SKIP.containsSome(shortName)) { 168 continue; // missing 169 } 170 Set<String> keywords = annotations.getKeywordsMinus(derivable); 171 String path = "//ldml/annotations/annotation[@cp=\"" + derivable + "\"]"; 172 if (!keywords.isEmpty()) { 173 Set<String> keywordsFixed = new HashSet<>(); 174 for (String keyword : keywords) { 175 if (!SKIP.containsSome(keyword)) { 176 keywordsFixed.add(keyword); 177 } 178 } 179 if (!keywordsFixed.isEmpty()) { 180 String value = BAR.join(keywordsFixed); 181 String newValue = DAIP.processInput(path, value, internalException); 182 target.add(path, newValue); 183 } 184 } 185 failures.remove(derivable); 186 String ttsPath = path + "[@type=\"tts\"]"; 187 String shortName2 = DAIP.processInput(path, shortName, internalException); 188 target.add(ttsPath, shortName2); 189 } 190 failures.freeze(); 191 if (!failures.isEmpty()) { 192 Level level = StandardCodes.make().getLocaleCoverageLevel(Organization.cldr, locale); 193 System.out.println("Failures\t" + locale 194 + "\t" + level 195 + "\t" + english.getName(locale) 196 + "\t" + failures.size() 197 + "\t" + failures.toPattern(false)); 198 failureMap.put(locale, failures.size()); 199 } 200 if (missingOnly) { 201 continue; 202 } 203 try (PrintWriter pw = FileUtilities.openUTF8Writer(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", locale + ".xml")) { 204 target.write(pw); 205 } 206 } 207 Factory factory = Factory.make(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", ".*"); 208 for (String locale : locales) { 209 if ("root".equals(locale)) { 210 continue; 211 } 212 if (!localeMatcher.reset(locale).matches()) { 213 continue; 214 } 215 CLDRFile cldrFileUnresolved = factory.make(locale, false); 216 CLDRFile cldrFileResolved = factory.make(locale, true); 217 Set<String> toRemove = new TreeSet<>(); // TreeSet just makes debugging easier 218 boolean gotOne = false; 219 for (String xpath : cldrFileUnresolved) { 220 if (xpath.startsWith("//ldml/identity")) { 221 continue; 222 } 223 224 String value = cldrFileUnresolved.getStringValue(xpath); 225 226 // remove items that are the same as their bailey values. This also catches Inheritance Marker 227 228 String bailey = cldrFileResolved.getBaileyValue(xpath, null, null); 229 if (value.equals(bailey)) { 230 toRemove.add(xpath); 231 continue; 232 } 233 gotOne = true; 234 } 235 if (!gotOne) { 236 if (locale.equals("sr_Cyrl")) { 237 System.err.println("TODO: keep from deleting files with non-empty children"); 238 } else { 239 System.out.println("Removing empty " + locale); 240 new File(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", locale + ".xml").deleteOnExit(); 241 } 242 } else if (!toRemove.isEmpty()) { 243 System.out.println("Removing " + toRemove.size() + " items from " + locale); 244 CLDRFile fileToWrite = cldrFileUnresolved.cloneAsThawed(); 245 fileToWrite.removeAll(toRemove, false); 246 File file = new File(CLDRPaths.COMMON_DIRECTORY + "annotationsDerived", locale + ".xml"); 247 try (PrintWriter pw = new PrintWriter(file)) { 248 fileToWrite.write(pw); 249 } 250 } 251 } 252 System.out.println("Be sure to run CLDRModify passes afterwards, and generate transformed locales (like de-CH)."); 253 if (!failureMap.isEmpty()) { 254 failureMap.entrySet().forEach(e -> System.err.printf("ERROR: %s: %d errors\n", e.getKey(), e.getValue())); 255 System.err.printf("ERROR: Errors in %d/%d locales.\n", failureMap.size(), processCount); 256 System.exit(1); 257 } else if(processCount == 0) { 258 System.err.println("ERROR: No locales matched. Check the -f option.\n"); 259 System.exit(1); 260 } else { 261 System.out.printf("OK: %d locales processed without error\n", processCount); 262 System.exit(0); 263 } 264 } 265 } 266