package org.unicode.cldr.util;

import java.io.File;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

import org.unicode.cldr.draft.ScriptMetadata;
import org.unicode.cldr.draft.ScriptMetadata.Info;
import org.unicode.cldr.draft.ScriptMetadata.Trinary;
import org.unicode.cldr.tool.LikelySubtags;
import org.unicode.cldr.util.CLDRFile.ExemplarType;
import org.unicode.cldr.util.SupplementalDataInfo.PluralType;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;

/**
 * Computes which "core" data items are present for the locale of a {@link CLDRFile}.
 *
 * <p>The single entry point is {@link #getCoreCoverageInfo(CLDRFile, Multimap)}. It consults the
 * supplemental data, likely subtags, and the file names found under {@code common/transforms},
 * {@code common/collation} and {@code common/casing} of the CLDR base directory.
 */
public class CoreCoverageInfo {

    private static final CLDRConfig config = CLDRConfig.getInstance();
    private static final String CLDR_BASE_DIRECTORY = config.getCldrBaseDirectory().toString();
    private static final SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
    private static final LikelySubtags ls = new LikelySubtags();

    /**
     * The items checked by {@link #getCoreCoverageInfo(CLDRFile, Multimap)}, each tagged with the
     * coverage {@link Level} it is associated with.
     */
    public enum CoreItems {
        default_content(Level.CORE),
        likely_subtags(Level.CORE),
        country_data(Level.CORE),
        orientation(Level.CORE),
        time_cycle(Level.CORE),

        casing(Level.MODERATE),
        plurals(Level.MODERATE),
        ordinals(Level.MODERATE),
        collation(Level.MODERATE),

        grammar(Level.MODERN),
        romanization(Level.MODERN),
        ;

        /** Immutable set holding {@link #romanization} and {@link #ordinals}. */
        public static final Set<CoreItems> ONLY_RECOMMENDED = ImmutableSet.copyOf(
            EnumSet.of(romanization, ordinals));

        /** Number of constants in this enum. */
        public static final int COUNT = CoreItems.values().length;

        /** The level passed to the constructor for this item. */
        public final Level desiredLevel;

        CoreItems(Level desiredLevel) {
            this.desiredLevel = desiredLevel;
        }

        /** Defaults the level to {@link Level#CORE}. */
        CoreItems() {
            this(Level.CORE);
        }

        /** Returns the level abbreviation, a space, and the constant name. */
        @Override
        public String toString() {
            return desiredLevel.getAbbreviation() + " " + name();
        }
    }

    /** Frozen set of characters whose bidi class is R or AL. */
    static final UnicodeSet RTL = new UnicodeSet("[[:bc=R:][:bc=AL:]]").freeze();

    /**
     * Determines which {@link CoreItems} are satisfied for the locale of {@code file}.
     *
     * @param file the file to inspect; if it is resolved, its unresolved form is used instead
     * @param detailedErrors cleared on entry, then filled with one path string per item that is
     *        not satisfied
     * @return an immutable set of the satisfied items
     */
    public static Set<CoreItems> getCoreCoverageInfo(CLDRFile file, Multimap<CoreItems, String> detailedErrors) {
        detailedErrors.clear();
        if (file.isResolved()) {
            file = file.getUnresolved();
        }
        String locale = file.getLocaleID();
        LanguageTagParser ltp = new LanguageTagParser();
        // From here on "locale" is only the language+script part of the file's locale ID.
        locale = ltp.set(locale).getLanguageScript();
        String baseLanguage = ltp.getLanguage();
        String script = ltp.getScript();
        String region = ltp.getRegion();

        Set<CoreItems> result = EnumSet.noneOf(CoreItems.class);

        // (02) Orientation (bidi writing systems only) [main/xxx.xml]
        // Satisfied when "characterOrder is right-to-left" agrees with "the main exemplars
        // contain at least one RTL character".
        UnicodeSet main = file.getExemplarSet(ExemplarType.main, null);
        boolean isRtl = main.containsSome(RTL);

        String path = "//ldml/layout/orientation/characterOrder";
        String value = file.getStringValue(path);
        if ("right-to-left".equals(value) == isRtl) {
            result.add(CoreItems.orientation);
        } else {
            detailedErrors.put(CoreItems.orientation, path);
        }

        // (01) Plural rules [supplemental/plurals.xml and ordinals.xml]
        // For more information, see cldr-spec/plural-rules.
        if (sdi.getPluralLocales(PluralType.cardinal).contains(baseLanguage)) {
            result.add(CoreItems.plurals);
        } else {
            detailedErrors.put(CoreItems.plurals, "//supplementalData/plurals[@type=\"cardinal\"]/pluralRules[@locales=\"" + locale
                + "\"]/pluralRule[@count=\"other\"]");
        }
        if (sdi.getPluralLocales(PluralType.ordinal).contains(baseLanguage)) {
            result.add(CoreItems.ordinals);
        } else {
            detailedErrors.put(CoreItems.ordinals, "//supplementalData/plurals[@type=\"ordinal\"]/pluralRules[@locales=\"" + locale
                + "\"]/pluralRule[@count=\"other\"]");
        }

        // (01) Default content script and region (normally the country with the largest
        // population using that language, and the normal script for that).
        // [supplemental/supplementalMetadata.xml]
        String defaultContent = sdi.getDefaultContentLocale(locale);
        // "no" is accepted even without a default-content entry.
        if (defaultContent != null || locale.equals("no")) {
            result.add(CoreItems.default_content);
        } else {
            detailedErrors.put(CoreItems.default_content, "//supplementalData/supplementalMetadata/defaultContent");
        }

        // Likely subtags: satisfied when maximizing yields both a script and a region.
        // When maximizing succeeds, script and region are replaced by the maximized values
        // and those are what the later checks use.
        String max = ls.maximize(locale);
        String maxLangScript = null;
        if (max != null) {
            ltp.set(max);
            maxLangScript = ltp.getLanguageScript();
            script = ltp.getScript();
            region = ltp.getRegion();
            if (!script.isEmpty() && !region.isEmpty()) {
                result.add(CoreItems.likely_subtags);
            }
        }
        if (!result.contains(CoreItems.likely_subtags)) {
            detailedErrors.put(CoreItems.likely_subtags, "//supplementalData/likelySubtags");
        }

        // (N) Verify the country data (i.e. the territories in which the language is spoken
        // enough to create a locale) [supplemental/supplementalData.xml]
        // We verify that there is at least one region, trying three keys in order:
        // locale (language+script), base language, maxLangScript.
        Set<String> territories = sdi.getTerritoriesForPopulationData(locale);
        if (territories == null) {
            territories = sdi.getTerritoriesForPopulationData(baseLanguage);
        }
        if (territories == null && maxLangScript != null) {
            territories = sdi.getTerritoriesForPopulationData(maxLangScript);
        }
        if (territories != null && territories.size() != 0) {
            result.add(CoreItems.country_data);
        } else {
            detailedErrors.put(CoreItems.country_data, "//supplementalData/territoryInfo");
            sdi.getTerritoriesForPopulationData(locale); // for debugging
        }

        // *(N) Romanization table (non-Latin writing systems only) [spreadsheet, we'll translate into transforms/xxx-en.xml]
        // If a spreadsheet, for each letter (or sequence) in the exemplars, what is the corresponding Latin letter (or sequence).
        // More sophisticated users can do a better job, supplying a file of rules like transforms/Arabic-Latin-BGN.xml.
        if (script.equals("Latn") || hasRomanizationTransform(script)) {
            result.add(CoreItems.romanization);
        } else {
            detailedErrors.put(CoreItems.romanization, "//supplementalData/transforms/transform"
                + "[@source=\"und-" + script + "\"]"
                + "[@target=\"und-Latn\"]");
        }

        // (N) Casing information (cased scripts only, according to ScriptMetadata.txt)
        // This will be in common/casing
        // NOTE(review): scriptData is dereferenced without a null check; confirm that
        // ScriptMetadata.getInfo never returns null for the scripts that reach this point.
        Info scriptData = ScriptMetadata.getInfo(script);
        if (scriptData.hasCase == Trinary.YES) {
            if (hasFile(SpecialDir.casing, baseLanguage)) {
                result.add(CoreItems.casing);
            } else {
                detailedErrors.put(CoreItems.casing, "//ldml/metadata/casingData/casingItem[@type=\"*\"]");
            }
        } else {
            // Scripts not marked as cased need no casing file.
            result.add(CoreItems.casing);
        }

        // (N) Collation rules [non-Survey Tool]
        // For details, see cldr-spec/collation-guidelines.
        // The result will be a file like: common/collation/ar.xml or common/collation/da.xml.
        // Note that the "search" collators (which tend to be large) are not needed initially.

        // check for file cldr/collation/<language>.xml
        if (hasFile(SpecialDir.collation, baseLanguage)) {
            result.add(CoreItems.collation);
        } else {
            detailedErrors.put(CoreItems.collation, "//ldml/collations/collation[@type=\"standard\"]");
        }

        // Time cycle: satisfied when the time data has an entry for the region.
        Map<String, PreferredAndAllowedHour> timeData = sdi.getTimeData();
        if (timeData.get(region) != null) {
            result.add(CoreItems.time_cycle);
        } else {
            detailedErrors.put(CoreItems.time_cycle, "//supplementalData/timeData/hours");
        }

        // Grammar: satisfied when grammar info exists for the language+script locale.
        GrammarInfo grammarInfo = sdi.getGrammarInfo(locale);
        if (grammarInfo != null) {
            result.add(CoreItems.grammar);
        } else {
            detailedErrors.put(CoreItems.grammar, "//supplementalData/grammaticalData/grammaticalFeatures");
        }

        // finalize
        return ImmutableSet.copyOf(result);
    }

    /**
     * Returns whether a transform file exists for any candidate name of {@code script} combined
     * with any prefix/suffix pair in {@link #ROMANIZATION_PATHS}. Stops at the first match.
     */
    private static boolean hasRomanizationTransform(String script) {
        for (String scriptName : getScriptNames(script)) {
            for (String[] pair : ROMANIZATION_PATHS) {
                if (hasFile(SpecialDir.transforms, pair[0] + scriptName + pair[1])) {
                    return true;
                }
            }
        }
        return false;
    }

    /** Prefix/suffix pairs wrapped around a script name to form candidate transform file names. */
    private static final String[][] ROMANIZATION_PATHS = {
        { "", "-Latin" },
        { "", "-Latin-BGN" },
        { "Latin-", "" },
    };

    /** Script codes whose candidate transform names are listed explicitly instead of derived. */
    private static final Relation<String, String> SCRIPT_NAMES = Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
    static {
        SCRIPT_NAMES.putAll("Arab", Arrays.asList("Arabic", "Arab"));
        SCRIPT_NAMES.putAll("Jpan", Arrays.asList("Jpan", "Han"));
        SCRIPT_NAMES.putAll("Hant", Arrays.asList("Hant", "Han"));
        SCRIPT_NAMES.putAll("Hans", Arrays.asList("Hans", "Han"));
        SCRIPT_NAMES.putAll("Kore", Arrays.asList("Hang", "Hangul"));
        SCRIPT_NAMES.freeze();
    }

    /**
     * Returns the names to try for {@code script} when looking for a transform file: the explicit
     * entry in {@link #SCRIPT_NAMES} if there is one, otherwise the name reported by
     * {@link UScript} together with the script code itself.
     */
    private static Set<String> getScriptNames(String script) {
        Set<String> result = SCRIPT_NAMES.get(script);
        if (result != null) {
            return result;
        }
        result = new HashSet<String>();
        // NOTE(review): behavior for a code that UScript does not recognize (e.g. an empty
        // script) is not visible here; confirm whether this can throw.
        String name = UScript.getName(UScript.getCodeFromName(script));
        result.add(name);
        result.add(script);
        return result;
    }

    /** Subdirectories of {@code common/} whose file names are indexed at class load. */
    private enum SpecialDir {
        transforms, collation, casing
    }

    /** For each {@link SpecialDir}, the entry names in that directory with any ".xml" suffix removed. */
    private static final Relation<SpecialDir, String> SPECIAL_FILES = Relation.of(
        new EnumMap<SpecialDir, Set<String>>(SpecialDir.class), HashSet.class);
    static {
        for (SpecialDir dir : SpecialDir.values()) {
            File realDir = new File(CLDR_BASE_DIRECTORY + "/common/" + dir);
            String[] names = realDir.list();
            if (names == null) {
                // File.list() returns null when the path is not a directory or cannot be read.
                // Fail with a message naming the directory instead of a bare NullPointerException.
                throw new IllegalStateException("Cannot list CLDR directory: " + realDir);
            }
            for (String s : names) {
                if (s.endsWith(".xml")) {
                    s = s.substring(0, s.length() - 4);
                }
                SPECIAL_FILES.put(dir, s);
            }
        }
    }

    /**
     * Returns whether {@code filename} (without the ".xml" suffix) was found in the directory for
     * {@code type}. A directory that contributed no entries is treated as containing nothing.
     */
    private static boolean hasFile(SpecialDir type, String filename) {
        Set<String> files = SPECIAL_FILES.get(type);
        return files != null && files.contains(filename);
    }
}