package org.unicode.cldr.util;

import java.io.File;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Set;

import org.unicode.cldr.draft.ScriptMetadata;
import org.unicode.cldr.draft.ScriptMetadata.Info;
import org.unicode.cldr.draft.ScriptMetadata.Trinary;
import org.unicode.cldr.tool.LikelySubtags;
import org.unicode.cldr.util.SupplementalDataInfo.PluralType;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterDirection;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.text.UnicodeSetIterator;

/**
 * Computes which "core" data items are present for a locale, and records a path for each
 * item that is missing.
 */
public class CoreCoverageInfo {

    private static final CLDRConfig config = CLDRConfig.getInstance();
    private static final String CLDR_BASE_DIRECTORY = config.getCldrBaseDirectory().toString();
    private static final SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
    private static final LikelySubtags ls = new LikelySubtags(sdi);

    /**
     * The items checked by {@link #getCoreCoverageInfo(CLDRFile, Multimap)}. Each item carries the
     * level at which it is desired; items declared without an explicit level use {@link Level#CORE}.
     *
     * <p>The exemplar items (main_exemplar, auxiliary_exemplar, numbers_exemplar,
     * punctuation_exemplar, index_exemplar) are intentionally not part of this enum.
     */
    public enum CoreItems {
        orientation,
        plurals,
        default_content,
        likely_subtags,
        country_data,
        casing,
        collation,
        romanization(Level.MODERATE),
        ordinals(Level.MODERN),
        ;

        /** The items whose desired level is above {@link Level#CORE}. Immutable. */
        public static final Set<CoreItems> ONLY_RECOMMENDED = ImmutableSet.copyOf(
            EnumSet.of(romanization, ordinals));

        /** Number of constants in this enum. */
        public static final int COUNT = CoreItems.values().length;

        /** The level at which this item is desired. */
        public final Level desiredLevel;

        CoreItems(Level desiredLevel) {
            this.desiredLevel = desiredLevel;
        }

        CoreItems() {
            this(Level.CORE);
        }

        /** Returns the constant name, with a trailing "*" when the desired level is not CORE. */
        @Override
        public String toString() {
            return name() + (desiredLevel == Level.CORE ? "" : "*");
        }
    }

    /**
     * Determines which core items are available for the locale of {@code file}.
     *
     * <p>The file's locale ID is reduced to its language and script before the supplemental-data
     * lookups; the per-file lookups (orientation, exemplars) are made on the unresolved file.
     *
     * @param file the locale file to inspect; if it is resolved, its unresolved form is used
     * @param detailedErrors receives, for each item found missing, one path describing where the
     *        data was expected
     * @return an unmodifiable set of the items that were found
     */
    public static Set<CoreItems> getCoreCoverageInfo(CLDRFile file, Multimap<CoreItems, String> detailedErrors) {
        if (file.isResolved()) {
            file = file.getUnresolved();
        }
        String locale = file.getLocaleID();
        LanguageTagParser ltp = new LanguageTagParser();
        locale = ltp.set(locale).getLanguageScript();
        String baseLanguage = ltp.getLanguage();
        String script = ltp.getScript();

        Set<CoreItems> result = EnumSet.noneOf(CoreItems.class);

        // (02) Orientation (bidi writing systems only) [main/xxx.xml]
        // The declared character order must agree with the direction implied by the main
        // exemplar characters: right-to-left exactly when the exemplars are right-to-left.
        boolean isRtl = hasRtlMainExemplars(file);
        String path = "//ldml/layout/orientation/characterOrder";
        String value = file.getStringValue(path);
        if ("right-to-left".equals(value) == isRtl) {
            result.add(CoreItems.orientation);
        } else {
            detailedErrors.put(CoreItems.orientation, path);
        }

        // (01) Plural rules [supplemental/plurals.xml and ordinals.xml]
        // For more information, see cldr-spec/plural-rules.
        if (sdi.getPluralLocales(PluralType.cardinal).contains(baseLanguage)) {
            result.add(CoreItems.plurals);
        } else {
            detailedErrors.put(CoreItems.plurals, "//supplementalData/plurals[@type=\"cardinal\"]");
        }
        if (sdi.getPluralLocales(PluralType.ordinal).contains(baseLanguage)) {
            result.add(CoreItems.ordinals);
        } else {
            detailedErrors.put(CoreItems.ordinals, "//supplementalData/plurals[@type=\"ordinal\"]");
        }

        // (01) Default content script and region (normally the country with the largest population
        // using that language, and the normal script for that). [supplemental/supplementalMetadata.xml]
        String defaultContent = sdi.getDefaultContentLocale(locale);
        if (defaultContent != null) {
            result.add(CoreItems.default_content);
        } else {
            detailedErrors.put(CoreItems.default_content, "//supplementalData/supplementalMetadata/defaultContent");
        }

        // Likely subtags: maximizing the locale must yield both a script and a region.
        // When maximization succeeds, the maximized script replaces the one parsed from the locale ID
        // and is used by the romanization and casing checks below.
        String max = ls.maximize(locale);
        String maxLangScript = null;
        if (max != null) {
            ltp.set(max);
            maxLangScript = ltp.getLanguageScript();
            script = ltp.getScript();
            if (!script.isEmpty() && !ltp.getRegion().isEmpty()) {
                result.add(CoreItems.likely_subtags);
            }
        }
        if (!result.contains(CoreItems.likely_subtags)) {
            detailedErrors.put(CoreItems.likely_subtags, "//supplementalData/likelySubtags");
        }

        // (N) Verify the country data (i.e. the territories in which the language is spoken enough
        // to create a locale) [supplemental/supplementalData.xml]
        // We verify that there is at least one region, trying three keys in turn:
        // locale, base language, maximized language+script.
        Set<String> territories = sdi.getTerritoriesForPopulationData(locale);
        if (territories == null) {
            territories = sdi.getTerritoriesForPopulationData(baseLanguage);
        }
        if (territories == null && maxLangScript != null) {
            territories = sdi.getTerritoriesForPopulationData(maxLangScript);
        }
        if (territories != null && !territories.isEmpty()) {
            result.add(CoreItems.country_data);
        } else {
            detailedErrors.put(CoreItems.country_data, "//supplementalData/territoryInfo");
        }

        // *(N) Romanization table (non-Latin writing systems only)
        // [spreadsheet, we'll translate into transforms/xxx-en.xml]
        // If a spreadsheet, for each letter (or sequence) in the exemplars, what is the corresponding
        // Latin letter (or sequence). More sophisticated users can do a better job, supplying a file
        // of rules like transforms/Arabic-Latin-BGN.xml.
        if (script.equals("Latn") || hasRomanizationTransform(script)) {
            result.add(CoreItems.romanization);
        } else {
            detailedErrors.put(CoreItems.romanization, "//supplementalData/transforms/transform"
                + "[@source=\"und-" + script + "\"]"
                + "[@target=\"und-Latn\"]");
        }

        // (N) Casing information (cased scripts only, according to ScriptMetadata.txt)
        // This will be in common/casing. Scripts that are not marked as cased pass automatically.
        // NOTE(review): getInfo(script) is dereferenced without a null check; confirm it cannot
        // return null for the script values that reach this point.
        Info scriptData = ScriptMetadata.getInfo(script);
        if (scriptData.hasCase == Trinary.YES) {
            if (hasFile(SpecialDir.casing, baseLanguage)) {
                result.add(CoreItems.casing);
            } else {
                detailedErrors.put(CoreItems.casing, "//ldml/metadata/casingData/");
            }
        } else {
            result.add(CoreItems.casing);
        }

        // (N) Collation rules [non-Survey Tool]
        // For details, see cldr-spec/collation-guidelines.
        // The result will be a file like: common/collation/ar.xml or common/collation/da.xml.
        // Note that the "search" collators (which tend to be large) are not needed initially.
        // Check for file common/collation/<language>.xml
        if (hasFile(SpecialDir.collation, baseLanguage)) {
            result.add(CoreItems.collation);
        } else {
            detailedErrors.put(CoreItems.collation, "//ldml/collations/collation[@type=\"standard\"]");
        }
        return Collections.unmodifiableSet(result);
    }

    /**
     * Returns whether the main exemplar characters stored directly in {@code file} are
     * right-to-left according to {@link #isRtl(UnicodeSet)}. Returns false when the file has no
     * main exemplar value.
     */
    private static boolean hasRtlMainExemplars(CLDRFile file) {
        String exemplars = file.getStringValue("//ldml/characters/exemplarCharacters");
        if (exemplars == null) {
            return false;
        }
        try {
            return isRtl(new UnicodeSet(exemplars));
        } catch (IllegalArgumentException ignored) {
            // The exemplar value is not a parsable set pattern, so no direction can be derived
            // from it; treat the locale as not right-to-left rather than failing the whole report.
            return false;
        }
    }

    /**
     * Returns whether common/transforms contains a file matching one of
     * {@link #ROMANIZATION_PATHS} for any of the names of {@code script}.
     */
    private static boolean hasRomanizationTransform(String script) {
        for (String scriptName : getScriptNames(script)) {
            for (String[] pair : ROMANIZATION_PATHS) {
                if (hasFile(SpecialDir.transforms, pair[0] + scriptName + pair[1])) {
                    return true;
                }
            }
        }
        return false;
    }

    /** Prefix/suffix pairs placed around a script name to form candidate transform file names. */
    private static final String[][] ROMANIZATION_PATHS = {
        { "", "-Latin" },
        { "", "-Latin-BGN" },
        { "Latin-", "" },
    };

    /** Script codes whose candidate names are listed explicitly instead of being derived via UScript. */
    private static final Relation<String, String> SCRIPT_NAMES = Relation.of(new HashMap<String, Set<String>>(), HashSet.class);
    static {
        SCRIPT_NAMES.putAll("Arab", Arrays.asList("Arabic", "Arab"));
        SCRIPT_NAMES.putAll("Jpan", Arrays.asList("Jpan", "Han"));
        SCRIPT_NAMES.putAll("Hant", Arrays.asList("Hant", "Han"));
        SCRIPT_NAMES.putAll("Hans", Arrays.asList("Hans", "Han"));
        SCRIPT_NAMES.putAll("Kore", Arrays.asList("Hang", "Hangul"));
        SCRIPT_NAMES.freeze();
    }

    /**
     * Returns the names under which a script may appear in transform file names: the explicit
     * entry from {@link #SCRIPT_NAMES} if there is one, otherwise the name reported by
     * {@link UScript} together with the script code itself.
     */
    private static Set<String> getScriptNames(String script) {
        Set<String> result = SCRIPT_NAMES.get(script);
        if (result != null) {
            return result;
        }
        result = new HashSet<>();
        String name = UScript.getName(UScript.getCodeFromName(script));
        result.add(name);
        result.add(script);
        return result;
    }

    /** Subdirectories of common/ whose file names are indexed in {@link #SPECIAL_FILES}. */
    private enum SpecialDir {
        transforms, collation, casing
    }

    /** For each special directory, the names of the entries it contains, with a trailing ".xml" removed. */
    private static final Relation<SpecialDir, String> SPECIAL_FILES = Relation.of(new EnumMap<SpecialDir, Set<String>>(SpecialDir.class), HashSet.class);
    static {
        for (SpecialDir dir : SpecialDir.values()) {
            File realDir = new File(CLDR_BASE_DIRECTORY + "/common/" + dir);
            String[] names = realDir.list();
            if (names == null) {
                // File.list() returns null when the path is not a directory or cannot be read.
                // Record nothing for it, so hasFile() reports false instead of class
                // initialization failing with a NullPointerException.
                continue;
            }
            for (String s : names) {
                if (s.endsWith(".xml")) {
                    s = s.substring(0, s.length() - 4);
                }
                SPECIAL_FILES.put(dir, s);
            }
        }
    }

    /**
     * Returns whether the given special directory contains an entry with the given name
     * (without the ".xml" extension). Returns false when nothing was indexed for the directory.
     */
    private static boolean hasFile(SpecialDir type, String filename) {
        Set<String> files = SPECIAL_FILES.get(type);
        return files != null && files.contains(filename);
    }

    /**
     * Determines the direction of a character set from the first code point, in range-iteration
     * order, whose bidi class is L, R or AL.
     *
     * @param main the set to examine
     * @return true if that code point is R or AL; false if it is L, or if no range contains a
     *         code point with one of those three classes
     */
    public static boolean isRtl(UnicodeSet main) {
        for (UnicodeSetIterator it = new UnicodeSetIterator(main); it.nextRange();) {
            for (int i = it.codepoint; i <= it.codepointEnd; ++i) {
                int bidiClass = UCharacter.getDirection(i);
                switch (bidiClass) {
                case UCharacterDirection.RIGHT_TO_LEFT:
                case UCharacterDirection.RIGHT_TO_LEFT_ARABIC:
                    return true;
                case UCharacterDirection.LEFT_TO_RIGHT:
                    return false;
                }
            }
        }
        return false;
    }

}