1 package org.unicode.cldr.util; 2 3 import java.io.File; 4 import java.util.Arrays; 5 import java.util.Collections; 6 import java.util.EnumMap; 7 import java.util.EnumSet; 8 import java.util.HashMap; 9 import java.util.HashSet; 10 import java.util.LinkedHashSet; 11 import java.util.Set; 12 13 import org.unicode.cldr.draft.ScriptMetadata; 14 import org.unicode.cldr.draft.ScriptMetadata.Info; 15 import org.unicode.cldr.draft.ScriptMetadata.Trinary; 16 import org.unicode.cldr.tool.LikelySubtags; 17 import org.unicode.cldr.util.SupplementalDataInfo.PluralType; 18 19 import com.google.common.collect.ImmutableSet; 20 import com.google.common.collect.Multimap; 21 import com.ibm.icu.impl.Relation; 22 import com.ibm.icu.lang.UCharacter; 23 import com.ibm.icu.lang.UCharacterDirection; 24 import com.ibm.icu.lang.UScript; 25 import com.ibm.icu.text.UnicodeSet; 26 import com.ibm.icu.text.UnicodeSetIterator; 27 28 public class CoreCoverageInfo { 29 30 private static final CLDRConfig config = CLDRConfig.getInstance(); 31 private static final String CLDR_BASE_DIRECTORY = config.getCldrBaseDirectory().toString(); 32 private static final SupplementalDataInfo sdi = SupplementalDataInfo.getInstance(); 33 private static final LikelySubtags ls = new LikelySubtags(); 34 35 public enum CoreItems { 36 // Drop the exemplars, since 37 // main_exemplar, auxiliary_exemplar, // numbers_exemplar, punctuation_exemplar, index_exemplar(Level.MODERN) 38 orientation, 39 plurals, 40 default_content, likely_subtags, 41 country_data, 42 casing, 43 collation, 44 romanization(Level.MODERATE), 45 ordinals(Level.MODERN), 46 ; 47 48 public static Set<CoreItems> ONLY_RECOMMENDED = ImmutableSet.copyOf( 49 EnumSet.of(romanization, ordinals)); 50 51 // private static final Set<CoreItems> EXEMPLARS = ImmutableSet.copyOf(EnumSet.of( 52 // main_exemplar, auxiliary_exemplar 53 // //, numbers_exemplar, punctuation_exemplar, index_exemplar 54 // )); 55 56 public static final int COUNT = CoreItems.values().length; 57 public final Level desiredLevel; 58 CoreItems(Level desiredLevel)59 CoreItems(Level desiredLevel) { 60 this.desiredLevel = desiredLevel; 61 } CoreItems()62 CoreItems() { 63 this(Level.CORE); 64 } 65 @Override toString()66 public String toString() { 67 // TODO Auto-generated method stub 68 return name() + (desiredLevel == Level.CORE ? "" : "*"); 69 } 70 } 71 getCoreCoverageInfo(CLDRFile file, Multimap<CoreItems,String> detailedErrors)72 public static Set<CoreItems> getCoreCoverageInfo(CLDRFile file, Multimap<CoreItems,String> detailedErrors) { 73 if (file.isResolved()) { 74 file = file.getUnresolved(); 75 } 76 String locale = file.getLocaleID(); 77 LanguageTagParser ltp = new LanguageTagParser(); 78 locale = ltp.set(locale).getLanguageScript(); 79 String baseLanguage = ltp.getLanguage(); 80 String script = ltp.getScript(); 81 82 Set<CoreItems> result = EnumSet.noneOf(CoreItems.class); 83 84 // (04) Exemplar sets: main, auxiliary, index, punctuation. [main/xxx.xml] 85 // These must reflect the Unicode model. For more information, see tr35-general.html#Character_Elements. 86 boolean isRtl = false; 87 // for (CoreItems exemplar : CoreItems.EXEMPLARS) { 88 // String type = exemplar.toString(); 89 // type = type.substring(0, type.indexOf('_')); 90 // 91 // String path = "//ldml/characters/exemplarCharacters"; 92 // boolean isMain = type.equals("main"); 93 // if (!isMain) { 94 // path += "[@type=\"" + type + "\"]"; 95 // } 96 // String value = file.getStringValue(path); 97 // if (value != null) { 98 // String sourceLocale = file.getSourceLocaleID(path, null); 99 // if (locale.equals(sourceLocale)) { 100 // result.add(exemplar); 101 // } 102 // } else { 103 // detailedErrors.put(exemplar, path); 104 // } 105 // if (isMain && result.contains(exemplar)) { 106 // UnicodeSet main = new UnicodeSet(value); 107 // isRtl = isRtl(main); 108 // } 109 // } 110 // (02) Orientation (bidi writing systems only) [main/xxx.xml] 111 String path = "//ldml/layout/orientation/characterOrder"; 112 String value = file.getStringValue(path); 113 if ("right-to-left".equals(value) == isRtl) { 114 result.add(CoreItems.orientation); 115 } else { 116 detailedErrors.put(CoreItems.orientation, path); 117 } 118 119 // (01) Plural rules [supplemental/plurals.xml and ordinals.xml] 120 // For more information, see cldr-spec/plural-rules. 121 if (sdi.getPluralLocales(PluralType.cardinal).contains(baseLanguage)) { 122 result.add(CoreItems.plurals); 123 } else { 124 detailedErrors.put(CoreItems.plurals, "//supplementalData/plurals[@type=\"cardinal\"]/pluralRules[@locales=\"" + locale 125 + "\"]/pluralRule[@count=\"other\"]"); 126 } 127 if (sdi.getPluralLocales(PluralType.ordinal).contains(baseLanguage)) { 128 result.add(CoreItems.ordinals); 129 } else { 130 detailedErrors.put(CoreItems.ordinals, "//supplementalData/plurals[@type=\"ordinal\"]/pluralRules[@locales=\"" + locale 131 + "\"]/pluralRule[@count=\"other\"]"); 132 } 133 134 // (01) Default content script and region (normally: normally country with largest population using that language, and normal script for that). [supplemental/supplementalMetadata.xml] 135 136 String defaultContent = sdi.getDefaultContentLocale(locale); 137 if (defaultContent != null) { 138 result.add(CoreItems.default_content); 139 } else { 140 detailedErrors.put(CoreItems.default_content, "//supplementalData/supplementalMetadata/defaultContent"); 141 } 142 // likely subtags 143 String max = ls.maximize(locale); 144 String maxLangScript = null; 145 if (max != null) { 146 ltp.set(max); 147 maxLangScript = ltp.getLanguageScript(); 148 script = ltp.getScript(); 149 if (!script.isEmpty() && !ltp.getRegion().isEmpty()) { 150 result.add(CoreItems.likely_subtags); 151 } 152 } 153 if (!result.contains(CoreItems.likely_subtags)) { 154 detailedErrors.put(CoreItems.likely_subtags, "//supplementalData/likelySubtags"); 155 } 156 // (N) Verify the country data ( i.e. which territories in which the language is spoken enough to create a locale ) [supplemental/supplementalData.xml] 157 // we verify that there is at least one region 158 // we try 3 cases: language, locale, maxLangScript 159 Set<String> territories = sdi.getTerritoriesForPopulationData(locale); 160 if (territories == null) { 161 territories = sdi.getTerritoriesForPopulationData(baseLanguage); 162 } 163 if (territories == null && maxLangScript != null) { 164 territories = sdi.getTerritoriesForPopulationData(maxLangScript); 165 } 166 if (territories != null && territories.size() != 0) { 167 result.add(CoreItems.country_data); 168 } else { 169 detailedErrors.put(CoreItems.country_data, "//supplementalData/territoryInfo"); 170 sdi.getTerritoriesForPopulationData(locale); // for debugging 171 } 172 // *(N) Romanization table (non-Latin writing systems only) [spreadsheet, we'll translate into transforms/xxx-en.xml] 173 // If a spreadsheet, for each letter (or sequence) in the exemplars, what is the corresponding Latin letter (or sequence). 174 // More sophisticated users can do a better job, supplying a file of rules like transforms/Arabic-Latin-BGN.xml. 175 176 if (script.equals("Latn")) { 177 result.add(CoreItems.romanization); 178 } else { 179 boolean found = false; 180 Set<String> scriptNames = getScriptNames(script); 181 Set<String> tempErrors = new LinkedHashSet<>(); 182 for (String scriptName : scriptNames) { 183 for (String[] pair : ROMANIZATION_PATHS) { 184 String filename = pair[0] + scriptName + pair[1]; 185 if (hasFile(SpecialDir.transforms, filename)) { 186 result.add(CoreItems.romanization); 187 found = true; 188 break; 189 } else { 190 tempErrors.add(script); // debugging 191 } 192 } 193 } 194 if (!found) { 195 detailedErrors.put(CoreItems.romanization, "//supplementalData/transforms/transform" 196 + "[@source=\"und-" + script + "\"]" 197 + "[@target=\"und-Latn\"]" 198 //+ "[@direction=\"forward\"]" 199 ); 200 } 201 } 202 203 // (N) Casing information (cased scripts only, according to ScriptMetadata.txt) 204 // This will be in common/casing 205 Info scriptData = ScriptMetadata.getInfo(script); 206 if (scriptData.hasCase == Trinary.YES) { 207 if (hasFile(SpecialDir.casing, baseLanguage)) { 208 result.add(CoreItems.casing); 209 } else { 210 detailedErrors.put(CoreItems.casing, "//ldml/metadata/casingData/casingItem[@type=\"*\"]"); 211 } 212 } else { 213 result.add(CoreItems.casing); 214 } 215 // (N) Collation rules [non-Survey Tool] 216 // For details, see cldr-spec/collation-guidelines. 217 // The result will be a file like: common/collation/ar.xml or common/collation/da.xml. 218 // Note that the "search" collators (which tend to be large) are not needed initially. 219 220 // check for file cldr/collation/<language>.xml 221 if (hasFile(SpecialDir.collation, baseLanguage)) { 222 result.add(CoreItems.collation); 223 } else { 224 detailedErrors.put(CoreItems.collation, "//ldml/collations/collation[@type=\"standard\"]"); 225 } 226 return Collections.unmodifiableSet(result); 227 } 228 229 private static final String[][] ROMANIZATION_PATHS = { 230 { "", "-Latin" }, 231 { "", "-Latin-BGN" }, 232 { "Latin-", "" }, 233 }; 234 235 private static final Relation SCRIPT_NAMES = Relation.of(new HashMap(), HashSet.class); 236 static { 237 SCRIPT_NAMES.putAll("Arab", Arrays.asList("Arabic", "Arab")); 238 SCRIPT_NAMES.putAll("Jpan", Arrays.asList("Jpan", "Han")); 239 SCRIPT_NAMES.putAll("Hant", Arrays.asList("Hant", "Han")); 240 SCRIPT_NAMES.putAll("Hans", Arrays.asList("Hans", "Han")); 241 SCRIPT_NAMES.putAll("Kore", Arrays.asList("Hang", "Hangul")); SCRIPT_NAMES.freeze()242 SCRIPT_NAMES.freeze(); 243 } 244 getScriptNames(String script)245 private static Set<String> getScriptNames(String script) { 246 Set<String> result = SCRIPT_NAMES.get(script); 247 if (result != null) { 248 return result; 249 } 250 result = new HashSet(); 251 String name = UScript.getName(UScript.getCodeFromName(script)); 252 result.add(name); 253 result.add(script); 254 return result; 255 } 256 257 private enum SpecialDir { 258 transforms, collation, casing 259 } 260 261 private static final Relation<SpecialDir, String> SPECIAL_FILES = Relation.of(new EnumMap(SpecialDir.class), HashSet.class); 262 static { 263 for (SpecialDir dir : SpecialDir.values()) { 264 File realDir = new File(CLDR_BASE_DIRECTORY + "/common/" + dir); 265 for (String s : realDir.list()) { 266 if (s.endsWith(".xml")) { 267 s = s.substring(0, s.length() - 4); 268 } SPECIAL_FILES.put(dir, s)269 SPECIAL_FILES.put(dir, s); 270 } 271 } 272 } 273 hasFile(SpecialDir type, String filename)274 private static boolean hasFile(SpecialDir type, String filename) { 275 return SPECIAL_FILES.get(type).contains(filename); 276 } 277 isRtl(UnicodeSet main)278 public static boolean isRtl(UnicodeSet main) { 279 for (UnicodeSetIterator it = new UnicodeSetIterator(main); it.nextRange();) { 280 for (int i = it.codepoint; i <= it.codepointEnd; ++i) { 281 int bidiClass = UCharacter.getDirection(i); 282 switch (bidiClass) { 283 case UCharacterDirection.RIGHT_TO_LEFT: 284 case UCharacterDirection.RIGHT_TO_LEFT_ARABIC: 285 return true; 286 case UCharacterDirection.LEFT_TO_RIGHT: 287 return false; 288 } 289 } 290 } 291 return false; 292 } 293 294 } 295