• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.util;
2 
3 import java.io.File;
4 import java.util.Arrays;
5 import java.util.Collections;
6 import java.util.EnumMap;
7 import java.util.EnumSet;
8 import java.util.HashMap;
9 import java.util.HashSet;
10 import java.util.LinkedHashSet;
11 import java.util.Set;
12 
13 import org.unicode.cldr.draft.ScriptMetadata;
14 import org.unicode.cldr.draft.ScriptMetadata.Info;
15 import org.unicode.cldr.draft.ScriptMetadata.Trinary;
16 import org.unicode.cldr.tool.LikelySubtags;
17 import org.unicode.cldr.util.SupplementalDataInfo.PluralType;
18 
19 import com.google.common.collect.ImmutableSet;
20 import com.google.common.collect.Multimap;
21 import com.ibm.icu.impl.Relation;
22 import com.ibm.icu.lang.UCharacter;
23 import com.ibm.icu.lang.UCharacterDirection;
24 import com.ibm.icu.lang.UScript;
25 import com.ibm.icu.text.UnicodeSet;
26 import com.ibm.icu.text.UnicodeSetIterator;
27 
28 public class CoreCoverageInfo {
29 
30     private static final CLDRConfig config = CLDRConfig.getInstance();
31     private static final String CLDR_BASE_DIRECTORY = config.getCldrBaseDirectory().toString();
32     private static final SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
33     private static final LikelySubtags ls = new LikelySubtags();
34 
35     public enum CoreItems {
36         // Drop the exemplars, since
37         // main_exemplar, auxiliary_exemplar, // numbers_exemplar, punctuation_exemplar, index_exemplar(Level.MODERN)
38         orientation,
39         plurals,
40         default_content, likely_subtags,
41         country_data,
42         casing,
43         collation,
44         romanization(Level.MODERATE),
45         ordinals(Level.MODERN),
46         ;
47 
48         public static Set<CoreItems> ONLY_RECOMMENDED = ImmutableSet.copyOf(
49             EnumSet.of(romanization, ordinals));
50 
51 //        private static final Set<CoreItems> EXEMPLARS = ImmutableSet.copyOf(EnumSet.of(
52 //            main_exemplar, auxiliary_exemplar
53 //            //, numbers_exemplar, punctuation_exemplar, index_exemplar
54 //            ));
55 
56         public static final int COUNT = CoreItems.values().length;
57         public final Level desiredLevel;
58 
CoreItems(Level desiredLevel)59         CoreItems(Level desiredLevel) {
60             this.desiredLevel = desiredLevel;
61         }
CoreItems()62         CoreItems() {
63             this(Level.CORE);
64         }
65         @Override
toString()66         public String toString() {
67             // TODO Auto-generated method stub
68             return name() + (desiredLevel == Level.CORE ? "" : "*");
69         }
70     }
71 
getCoreCoverageInfo(CLDRFile file, Multimap<CoreItems,String> detailedErrors)72     public static Set<CoreItems> getCoreCoverageInfo(CLDRFile file, Multimap<CoreItems,String> detailedErrors) {
73         if (file.isResolved()) {
74             file = file.getUnresolved();
75         }
76         String locale = file.getLocaleID();
77         LanguageTagParser ltp = new LanguageTagParser();
78         locale = ltp.set(locale).getLanguageScript();
79         String baseLanguage = ltp.getLanguage();
80         String script = ltp.getScript();
81 
82         Set<CoreItems> result = EnumSet.noneOf(CoreItems.class);
83 
84         //      (04) Exemplar sets: main, auxiliary, index, punctuation. [main/xxx.xml]
85         //      These must reflect the Unicode model. For more information, see tr35-general.html#Character_Elements.
86         boolean isRtl = false;
87 //        for (CoreItems exemplar : CoreItems.EXEMPLARS) {
88 //            String type = exemplar.toString();
89 //            type = type.substring(0, type.indexOf('_'));
90 //
91 //            String path = "//ldml/characters/exemplarCharacters";
92 //            boolean isMain = type.equals("main");
93 //            if (!isMain) {
94 //                path += "[@type=\"" + type + "\"]";
95 //            }
96 //            String value = file.getStringValue(path);
97 //            if (value != null) {
98 //                String sourceLocale = file.getSourceLocaleID(path, null);
99 //                if (locale.equals(sourceLocale)) {
100 //                    result.add(exemplar);
101 //                }
102 //            } else {
103 //                detailedErrors.put(exemplar, path);
104 //            }
105 //            if (isMain && result.contains(exemplar)) {
106 //                UnicodeSet main = new UnicodeSet(value);
107 //                isRtl = isRtl(main);
108 //            }
109 //        }
110         //      (02) Orientation (bidi writing systems only) [main/xxx.xml]
111         String path = "//ldml/layout/orientation/characterOrder";
112         String value = file.getStringValue(path);
113         if ("right-to-left".equals(value) == isRtl) {
114             result.add(CoreItems.orientation);
115         } else {
116             detailedErrors.put(CoreItems.orientation, path);
117         }
118 
119         //      (01) Plural rules [supplemental/plurals.xml and ordinals.xml]
120         //      For more information, see cldr-spec/plural-rules.
121         if (sdi.getPluralLocales(PluralType.cardinal).contains(baseLanguage)) {
122             result.add(CoreItems.plurals);
123         } else {
124             detailedErrors.put(CoreItems.plurals, "//supplementalData/plurals[@type=\"cardinal\"]/pluralRules[@locales=\"" + locale
125                 + "\"]/pluralRule[@count=\"other\"]");
126         }
127         if (sdi.getPluralLocales(PluralType.ordinal).contains(baseLanguage)) {
128             result.add(CoreItems.ordinals);
129         } else {
130             detailedErrors.put(CoreItems.ordinals, "//supplementalData/plurals[@type=\"ordinal\"]/pluralRules[@locales=\"" + locale
131                 + "\"]/pluralRule[@count=\"other\"]");
132         }
133 
134         //      (01) Default content script and region (normally: normally country with largest population using that language, and normal script for that).  [supplemental/supplementalMetadata.xml]
135 
136         String defaultContent = sdi.getDefaultContentLocale(locale);
137         if (defaultContent != null) {
138             result.add(CoreItems.default_content);
139         } else {
140             detailedErrors.put(CoreItems.default_content, "//supplementalData/supplementalMetadata/defaultContent");
141         }
142         // likely subtags
143         String max = ls.maximize(locale);
144         String maxLangScript = null;
145         if (max != null) {
146             ltp.set(max);
147             maxLangScript = ltp.getLanguageScript();
148             script = ltp.getScript();
149             if (!script.isEmpty() && !ltp.getRegion().isEmpty()) {
150                 result.add(CoreItems.likely_subtags);
151             }
152         }
153         if (!result.contains(CoreItems.likely_subtags)) {
154             detailedErrors.put(CoreItems.likely_subtags, "//supplementalData/likelySubtags");
155         }
156         //      (N) Verify the country data ( i.e. which territories in which the language is spoken enough to create a locale ) [supplemental/supplementalData.xml]
157         // we verify that there is at least one region
158         // we try 3 cases: language, locale, maxLangScript
159         Set<String> territories = sdi.getTerritoriesForPopulationData(locale);
160         if (territories == null) {
161             territories = sdi.getTerritoriesForPopulationData(baseLanguage);
162         }
163         if (territories == null && maxLangScript != null) {
164             territories = sdi.getTerritoriesForPopulationData(maxLangScript);
165         }
166         if (territories != null && territories.size() != 0) {
167             result.add(CoreItems.country_data);
168         } else {
169             detailedErrors.put(CoreItems.country_data, "//supplementalData/territoryInfo");
170             sdi.getTerritoriesForPopulationData(locale); // for debugging
171         }
172         //      *(N) Romanization table (non-Latin writing systems only) [spreadsheet, we'll translate into transforms/xxx-en.xml]
173         //      If a spreadsheet, for each letter (or sequence) in the exemplars, what is the corresponding Latin letter (or sequence).
174         //      More sophisticated users can do a better job, supplying a file of rules like transforms/Arabic-Latin-BGN.xml.
175 
176         if (script.equals("Latn")) {
177             result.add(CoreItems.romanization);
178         } else {
179             boolean found = false;
180             Set<String> scriptNames = getScriptNames(script);
181             Set<String> tempErrors = new LinkedHashSet<>();
182             for (String scriptName : scriptNames) {
183                 for (String[] pair : ROMANIZATION_PATHS) {
184                     String filename = pair[0] + scriptName + pair[1];
185                     if (hasFile(SpecialDir.transforms, filename)) {
186                         result.add(CoreItems.romanization);
187                         found = true;
188                         break;
189                     } else {
190                         tempErrors.add(script); // debugging
191                     }
192                 }
193             }
194             if (!found) {
195                 detailedErrors.put(CoreItems.romanization, "//supplementalData/transforms/transform"
196                     + "[@source=\"und-" + script + "\"]"
197                     + "[@target=\"und-Latn\"]"
198                     //+ "[@direction=\"forward\"]"
199                     );
200             }
201         }
202 
203         //      (N) Casing information (cased scripts only, according to ScriptMetadata.txt)
204         //      This will be in common/casing
205         Info scriptData = ScriptMetadata.getInfo(script);
206         if (scriptData.hasCase == Trinary.YES) {
207             if (hasFile(SpecialDir.casing, baseLanguage)) {
208                 result.add(CoreItems.casing);
209             } else {
210                 detailedErrors.put(CoreItems.casing, "//ldml/metadata/casingData/casingItem[@type=\"*\"]");
211             }
212         } else {
213             result.add(CoreItems.casing);
214         }
215         //      (N) Collation rules [non-Survey Tool]
216         //      For details, see cldr-spec/collation-guidelines.
217         //      The result will be a file like: common/collation/ar.xml or common/collation/da.xml.
218         //      Note that the "search" collators (which tend to be large) are not needed initially.
219 
220         // check for file cldr/collation/<language>.xml
221         if (hasFile(SpecialDir.collation, baseLanguage)) {
222             result.add(CoreItems.collation);
223         } else {
224             detailedErrors.put(CoreItems.collation, "//ldml/collations/collation[@type=\"standard\"]");
225         }
226         return Collections.unmodifiableSet(result);
227     }
228 
229     private static final String[][] ROMANIZATION_PATHS = {
230         { "", "-Latin" },
231         { "", "-Latin-BGN" },
232         { "Latin-", "" },
233     };
234 
235     private static final Relation SCRIPT_NAMES = Relation.of(new HashMap(), HashSet.class);
236     static {
237         SCRIPT_NAMES.putAll("Arab", Arrays.asList("Arabic", "Arab"));
238         SCRIPT_NAMES.putAll("Jpan", Arrays.asList("Jpan", "Han"));
239         SCRIPT_NAMES.putAll("Hant", Arrays.asList("Hant", "Han"));
240         SCRIPT_NAMES.putAll("Hans", Arrays.asList("Hans", "Han"));
241         SCRIPT_NAMES.putAll("Kore", Arrays.asList("Hang", "Hangul"));
SCRIPT_NAMES.freeze()242         SCRIPT_NAMES.freeze();
243     }
244 
getScriptNames(String script)245     private static Set<String> getScriptNames(String script) {
246         Set<String> result = SCRIPT_NAMES.get(script);
247         if (result != null) {
248             return result;
249         }
250         result = new HashSet();
251         String name = UScript.getName(UScript.getCodeFromName(script));
252         result.add(name);
253         result.add(script);
254         return result;
255     }
256 
257     private enum SpecialDir {
258         transforms, collation, casing
259     }
260 
261     private static final Relation<SpecialDir, String> SPECIAL_FILES = Relation.of(new EnumMap(SpecialDir.class), HashSet.class);
262     static {
263         for (SpecialDir dir : SpecialDir.values()) {
264             File realDir = new File(CLDR_BASE_DIRECTORY + "/common/" + dir);
265             for (String s : realDir.list()) {
266                 if (s.endsWith(".xml")) {
267                     s = s.substring(0, s.length() - 4);
268                 }
SPECIAL_FILES.put(dir, s)269                 SPECIAL_FILES.put(dir, s);
270             }
271         }
272     }
273 
hasFile(SpecialDir type, String filename)274     private static boolean hasFile(SpecialDir type, String filename) {
275         return SPECIAL_FILES.get(type).contains(filename);
276     }
277 
isRtl(UnicodeSet main)278     public static boolean isRtl(UnicodeSet main) {
279         for (UnicodeSetIterator it = new UnicodeSetIterator(main); it.nextRange();) {
280             for (int i = it.codepoint; i <= it.codepointEnd; ++i) {
281                 int bidiClass = UCharacter.getDirection(i);
282                 switch (bidiClass) {
283                 case UCharacterDirection.RIGHT_TO_LEFT:
284                 case UCharacterDirection.RIGHT_TO_LEFT_ARABIC:
285                     return true;
286                 case UCharacterDirection.LEFT_TO_RIGHT:
287                     return false;
288                 }
289             }
290         }
291         return false;
292     }
293 
294 }
295