• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.util;
2 
3 import java.io.File;
4 import java.util.Arrays;
5 import java.util.Collections;
6 import java.util.EnumMap;
7 import java.util.EnumSet;
8 import java.util.HashMap;
9 import java.util.HashSet;
10 import java.util.LinkedHashSet;
11 import java.util.Set;
12 
13 import org.unicode.cldr.draft.ScriptMetadata;
14 import org.unicode.cldr.draft.ScriptMetadata.Info;
15 import org.unicode.cldr.draft.ScriptMetadata.Trinary;
16 import org.unicode.cldr.tool.LikelySubtags;
17 import org.unicode.cldr.util.SupplementalDataInfo.PluralType;
18 
19 import com.google.common.collect.ImmutableSet;
20 import com.google.common.collect.Multimap;
21 import com.ibm.icu.impl.Relation;
22 import com.ibm.icu.lang.UCharacter;
23 import com.ibm.icu.lang.UCharacterDirection;
24 import com.ibm.icu.lang.UScript;
25 import com.ibm.icu.text.UnicodeSet;
26 import com.ibm.icu.text.UnicodeSetIterator;
27 
28 public class CoreCoverageInfo {
29 
30     private static final CLDRConfig config = CLDRConfig.getInstance();
31     private static final String CLDR_BASE_DIRECTORY = config.getCldrBaseDirectory().toString();
32     private static final SupplementalDataInfo sdi = SupplementalDataInfo.getInstance();
33     private static final LikelySubtags ls = new LikelySubtags(sdi);
34 
35     public enum CoreItems {
36         // Drop the exemplars, since
37         // main_exemplar, auxiliary_exemplar, // numbers_exemplar, punctuation_exemplar, index_exemplar(Level.MODERN)
38         orientation,
39         plurals,
40         default_content, likely_subtags,
41         country_data,
42         casing,
43         collation,
44         romanization(Level.MODERATE),
45         ordinals(Level.MODERN),
46         ;
47 
48         public static Set<CoreItems> ONLY_RECOMMENDED = ImmutableSet.copyOf(
49             EnumSet.of(romanization, ordinals));
50 
51 //        private static final Set<CoreItems> EXEMPLARS = ImmutableSet.copyOf(EnumSet.of(
52 //            main_exemplar, auxiliary_exemplar
53 //            //, numbers_exemplar, punctuation_exemplar, index_exemplar
54 //            ));
55 
56         public static final int COUNT = CoreItems.values().length;
57         public final Level desiredLevel;
58 
CoreItems(Level desiredLevel)59         CoreItems(Level desiredLevel) {
60             this.desiredLevel = desiredLevel;
61         }
CoreItems()62         CoreItems() {
63             this(Level.CORE);
64         }
65         @Override
toString()66         public String toString() {
67             // TODO Auto-generated method stub
68             return name() + (desiredLevel == Level.CORE ? "" : "*");
69         }
70     }
71 
getCoreCoverageInfo(CLDRFile file, Multimap<CoreItems,String> detailedErrors)72     public static Set<CoreItems> getCoreCoverageInfo(CLDRFile file, Multimap<CoreItems,String> detailedErrors) {
73         if (file.isResolved()) {
74             file = file.getUnresolved();
75         }
76         String locale = file.getLocaleID();
77         LanguageTagParser ltp = new LanguageTagParser();
78         locale = ltp.set(locale).getLanguageScript();
79         String baseLanguage = ltp.getLanguage();
80         String script = ltp.getScript();
81 
82         Set<CoreItems> result = EnumSet.noneOf(CoreItems.class);
83 
84         //      (04) Exemplar sets: main, auxiliary, index, punctuation. [main/xxx.xml]
85         //      These must reflect the Unicode model. For more information, see tr35-general.html#Character_Elements.
86         boolean isRtl = false;
87 //        for (CoreItems exemplar : CoreItems.EXEMPLARS) {
88 //            String type = exemplar.toString();
89 //            type = type.substring(0, type.indexOf('_'));
90 //
91 //            String path = "//ldml/characters/exemplarCharacters";
92 //            boolean isMain = type.equals("main");
93 //            if (!isMain) {
94 //                path += "[@type=\"" + type + "\"]";
95 //            }
96 //            String value = file.getStringValue(path);
97 //            if (value != null) {
98 //                String sourceLocale = file.getSourceLocaleID(path, null);
99 //                if (locale.equals(sourceLocale)) {
100 //                    result.add(exemplar);
101 //                }
102 //            } else {
103 //                detailedErrors.put(exemplar, path);
104 //            }
105 //            if (isMain && result.contains(exemplar)) {
106 //                UnicodeSet main = new UnicodeSet(value);
107 //                isRtl = isRtl(main);
108 //            }
109 //        }
110         //      (02) Orientation (bidi writing systems only) [main/xxx.xml]
111         String path = "//ldml/layout/orientation/characterOrder";
112         String value = file.getStringValue(path);
113         if ("right-to-left".equals(value) == isRtl) {
114             result.add(CoreItems.orientation);
115         } else {
116             detailedErrors.put(CoreItems.orientation, path);
117         }
118 
119         //      (01) Plural rules [supplemental/plurals.xml and ordinals.xml]
120         //      For more information, see cldr-spec/plural-rules.
121         if (sdi.getPluralLocales(PluralType.cardinal).contains(baseLanguage)) {
122             result.add(CoreItems.plurals);
123         } else {
124             detailedErrors.put(CoreItems.plurals, "//supplementalData/plurals[@type=\"cardinal\"]");
125         }
126         if (sdi.getPluralLocales(PluralType.ordinal).contains(baseLanguage)) {
127             result.add(CoreItems.ordinals);
128         } else {
129             detailedErrors.put(CoreItems.ordinals, "//supplementalData/plurals[@type=\"ordinal\"]");
130         }
131 
132         //      (01) Default content script and region (normally: normally country with largest population using that language, and normal script for that).  [supplemental/supplementalMetadata.xml]
133 
134         String defaultContent = sdi.getDefaultContentLocale(locale);
135         if (defaultContent != null) {
136             result.add(CoreItems.default_content);
137         } else {
138             detailedErrors.put(CoreItems.default_content, "//supplementalData/supplementalMetadata/defaultContent");
139         }
140         // likely subtags
141         String max = ls.maximize(locale);
142         String maxLangScript = null;
143         if (max != null) {
144             ltp.set(max);
145             maxLangScript = ltp.getLanguageScript();
146             script = ltp.getScript();
147             if (!script.isEmpty() && !ltp.getRegion().isEmpty()) {
148                 result.add(CoreItems.likely_subtags);
149             }
150         }
151         if (!result.contains(CoreItems.likely_subtags)) {
152             detailedErrors.put(CoreItems.likely_subtags, "//supplementalData/likelySubtags");
153         }
154         //      (N) Verify the country data ( i.e. which territories in which the language is spoken enough to create a locale ) [supplemental/supplementalData.xml]
155         // we verify that there is at least one region
156         // we try 3 cases: language, locale, maxLangScript
157         Set<String> territories = sdi.getTerritoriesForPopulationData(locale);
158         if (territories == null) {
159             territories = sdi.getTerritoriesForPopulationData(baseLanguage);
160         }
161         if (territories == null && maxLangScript != null) {
162             territories = sdi.getTerritoriesForPopulationData(maxLangScript);
163         }
164         if (territories != null && territories.size() != 0) {
165             result.add(CoreItems.country_data);
166         } else {
167             detailedErrors.put(CoreItems.country_data, "//supplementalData/territoryInfo");
168             sdi.getTerritoriesForPopulationData(locale); // for debugging
169         }
170         //      *(N) Romanization table (non-Latin writing systems only) [spreadsheet, we'll translate into transforms/xxx-en.xml]
171         //      If a spreadsheet, for each letter (or sequence) in the exemplars, what is the corresponding Latin letter (or sequence).
172         //      More sophisticated users can do a better job, supplying a file of rules like transforms/Arabic-Latin-BGN.xml.
173 
174         if (script.equals("Latn")) {
175             result.add(CoreItems.romanization);
176         } else {
177             boolean found = false;
178             Set<String> scriptNames = getScriptNames(script);
179             Set<String> tempErrors = new LinkedHashSet<>();
180             for (String scriptName : scriptNames) {
181                 for (String[] pair : ROMANIZATION_PATHS) {
182                     String filename = pair[0] + scriptName + pair[1];
183                     if (hasFile(SpecialDir.transforms, filename)) {
184                         result.add(CoreItems.romanization);
185                         found = true;
186                         break;
187                     } else {
188                         tempErrors.add(script); // debugging
189                     }
190                 }
191             }
192             if (!found) {
193                 detailedErrors.put(CoreItems.romanization, "//supplementalData/transforms/transform"
194                     + "[@source=\"und-" + script + "\"]"
195                     + "[@target=\"und-Latn\"]"
196                     //+ "[@direction=\"forward\"]"
197                     );
198             }
199         }
200 
201         //      (N) Casing information (cased scripts only, according to ScriptMetadata.txt)
202         //      This will be in common/casing
203         Info scriptData = ScriptMetadata.getInfo(script);
204         if (scriptData.hasCase == Trinary.YES) {
205             if (hasFile(SpecialDir.casing, baseLanguage)) {
206                 result.add(CoreItems.casing);
207             } else {
208                 detailedErrors.put(CoreItems.casing, "//ldml/metadata/casingData/");
209             }
210         } else {
211             result.add(CoreItems.casing);
212         }
213         //      (N) Collation rules [non-Survey Tool]
214         //      For details, see cldr-spec/collation-guidelines.
215         //      The result will be a file like: common/collation/ar.xml or common/collation/da.xml.
216         //      Note that the "search" collators (which tend to be large) are not needed initially.
217 
218         // check for file cldr/collation/<language>.xml
219         if (hasFile(SpecialDir.collation, baseLanguage)) {
220             result.add(CoreItems.collation);
221         } else {
222             detailedErrors.put(CoreItems.collation, "//ldml/collations/collation[@type=\"standard\"]");
223         }
224         return Collections.unmodifiableSet(result);
225     }
226 
227     private static final String[][] ROMANIZATION_PATHS = {
228         { "", "-Latin" },
229         { "", "-Latin-BGN" },
230         { "Latin-", "" },
231     };
232 
233     private static final Relation SCRIPT_NAMES = Relation.of(new HashMap(), HashSet.class);
234     static {
235         SCRIPT_NAMES.putAll("Arab", Arrays.asList("Arabic", "Arab"));
236         SCRIPT_NAMES.putAll("Jpan", Arrays.asList("Jpan", "Han"));
237         SCRIPT_NAMES.putAll("Hant", Arrays.asList("Hant", "Han"));
238         SCRIPT_NAMES.putAll("Hans", Arrays.asList("Hans", "Han"));
239         SCRIPT_NAMES.putAll("Kore", Arrays.asList("Hang", "Hangul"));
SCRIPT_NAMES.freeze()240         SCRIPT_NAMES.freeze();
241     }
242 
getScriptNames(String script)243     private static Set<String> getScriptNames(String script) {
244         Set<String> result = SCRIPT_NAMES.get(script);
245         if (result != null) {
246             return result;
247         }
248         result = new HashSet();
249         String name = UScript.getName(UScript.getCodeFromName(script));
250         result.add(name);
251         result.add(script);
252         return result;
253     }
254 
255     private enum SpecialDir {
256         transforms, collation, casing
257     }
258 
259     private static final Relation<SpecialDir, String> SPECIAL_FILES = Relation.of(new EnumMap(SpecialDir.class), HashSet.class);
260     static {
261         for (SpecialDir dir : SpecialDir.values()) {
262             File realDir = new File(CLDR_BASE_DIRECTORY + "/common/" + dir);
263             for (String s : realDir.list()) {
264                 if (s.endsWith(".xml")) {
265                     s = s.substring(0, s.length() - 4);
266                 }
SPECIAL_FILES.put(dir, s)267                 SPECIAL_FILES.put(dir, s);
268             }
269         }
270     }
271 
hasFile(SpecialDir type, String filename)272     private static boolean hasFile(SpecialDir type, String filename) {
273         return SPECIAL_FILES.get(type).contains(filename);
274     }
275 
isRtl(UnicodeSet main)276     public static boolean isRtl(UnicodeSet main) {
277         for (UnicodeSetIterator it = new UnicodeSetIterator(main); it.nextRange();) {
278             for (int i = it.codepoint; i <= it.codepointEnd; ++i) {
279                 int bidiClass = UCharacter.getDirection(i);
280                 switch (bidiClass) {
281                 case UCharacterDirection.RIGHT_TO_LEFT:
282                 case UCharacterDirection.RIGHT_TO_LEFT_ARABIC:
283                     return true;
284                 case UCharacterDirection.LEFT_TO_RIGHT:
285                     return false;
286                 }
287             }
288         }
289         return false;
290     }
291 
292 }
293