• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.json;
2 
3 import java.util.HashSet;
4 import java.util.Set;
5 import java.util.regex.Pattern;
6 
7 import org.unicode.cldr.util.Builder;
8 import org.unicode.cldr.util.CLDRFile;
9 import org.unicode.cldr.util.PatternCache;
10 
11 import com.google.common.collect.ImmutableSet;
12 
13 class LdmlConvertRules {
14 
15     /** File sets that will not be processed in JSON transformation. */
16     public static final ImmutableSet<String> IGNORE_FILE_SET = ImmutableSet.of("attributeValueValidity", "coverageLevels", "grammaticalFeatures", "postalCodeData",
17         "subdivisions", "units");
18 
19     /**
20      * The attribute list that should become part of the name in form of
21      * name-(attribute)-(value).
22      * [parent_element]:[element]:[attribute]
23      */
24     // common/main
25     static final ImmutableSet<String> NAME_PART_DISTINGUISHING_ATTR_SET = ImmutableSet.of(
26         "monthWidth:month:yeartype",
27         "characters:parseLenients:scope",
28         "dateFormat:pattern:numbers",
29         "characterLabelPatterns:characterLabelPattern:count", // originally under characterLabels
30         "currencyFormats:unitPattern:count",
31         "currency:displayName:count",
32         "numbers:symbols:numberSystem",
33         "numbers:decimalFormats:numberSystem",
34         "numbers:currencyFormats:numberSystem",
35         "numbers:percentFormats:numberSystem",
36         "numbers:scientificFormats:numberSystem",
37         "numbers:miscPatterns:numberSystem",
38         "minimalPairs:pluralMinimalPairs:count",
39         "territoryContainment:group:status",
40         "decimalFormat:pattern:count",
41         "currencyFormat:pattern:count",
42         "unit:unitPattern:count",
43         // compound units
44         "compoundUnit:compoundUnitPattern1:count",
45         "compoundUnit:compoundUnitPattern1:gender",
46         "compoundUnit:compoundUnitPattern1:case",
47         "field:relative:type",
48         "field:relativeTime:type",
49         "relativeTime:relativeTimePattern:count",
50         "availableFormats:dateFormatItem:count",
51         "listPatterns:listPattern:type",
52         "timeZoneNames:regionFormat:type",
53         "units:durationUnit:type",
54         "weekData:minDays:territories",
55         "weekData:firstDay:territories",
56         "weekData:weekendStart:territories",
57         "weekData:weekendEnd:territories",
58         "unitPreferenceDataData:unitPreferences:category",
59         "measurementData:measurementSystem:category",
60         "supplemental:plurals:type",
61         "pluralRanges:pluralRange:start",
62         "pluralRanges:pluralRange:end",
63         "pluralRules:pluralRule:count",
64         "languageMatches:languageMatch:desired",
65         "styleNames:styleName:subtype",
66         "styleNames:styleName:alt");
67 
68     /**
69      * The set of attributes that should become part of the name in form of
70      * name-(attribute)-(value).
71      */
72 
73     /**
74      * Following is a list of element:attribute pair. These attributes should be
75      * treated as values. For example,
76      * <type type="arab" key="numbers">Arabic-Indic Digits</type>
77      * should be really converted as,
78      * "arab": {
79      * "_value": "Arabic-Indic Digits",
80      * "_key": "numbers"
81      * }
82      */
83     static final ImmutableSet<String> ATTR_AS_VALUE_SET = ImmutableSet.of(
84 
85         // in common/supplemental/dayPeriods.xml
86         "dayPeriodRules:dayPeriodRule:from",
87 
88         // in common/supplemental/likelySubtags.xml
89         "likelySubtags:likelySubtag:to",
90 
91         // in common/supplemental/metaZones.xml
92         "timezone:usesMetazone:mzone",
93         // Only the current usesMetazone will be kept, it is not necessary to keep
94         // "to" and "from" attributes to make key unique. This is needed as their
95         // value is not good if used as key.
96         "timezone:usesMetazone:to",
97         "timezone:usesMetazone:from",
98 
99         "mapTimezones:mapZone:other",
100         "mapTimezones:mapZone:type",
101         "mapTimezones:mapZone:territory",
102 
103         // in common/supplemental/numberingSystems.xml
104         "numberingSystems:numberingSystem:type",
105 
106         // in common/supplemental/supplementalData.xml
107         "region:currency:from",
108         "region:currency:to",
109         "region:currency:tender",
110         "calendar:calendarSystem:type",
111         "codeMappings:territoryCodes:numeric",
112         "codeMappings:territoryCodes:alpha3",
113         "codeMappings:currencyCodes:numeric",
114         "timeData:hours:allowed",
115         "timeData:hours:preferred",
116         // common/supplemental/supplementalMetaData.xml
117         "validity:variable:type",
118         "deprecated:deprecatedItems:elements",
119         "deprecated:deprecatedItems:attributes",
120         "deprecated:deprecatedItems:type",
121 
122         // in common/supplemental/telephoneCodeData.xml
123         "codesByTerritory:telephoneCountryCode:code",
124 
125         // in common/supplemental/windowsZones.xml
126         "mapTimezones:mapZone:other",
127 
128         // in common/bcp47/*.xml
129         "keyword:key:alias",
130         "key:type:alias",
131         "key:type:name",
132 
133         // identity elements
134         "identity:language:type",
135         "identity:script:type",
136         "identity:territory:type",
137         "identity:variant:type");
138 
139     /**
140      * The set of element:attribute pair in which the attribute should be
141      * treated as value. All the attribute here are non-distinguishing attributes.
142      */
143 
144     /**
145      * For those attributes that are treated as values, they taken the form of
146      * element_name: { ..., attribute: value, ...}
147      * This is desirable as an element may have several attributes that are
148      * treated as values. But in some cases, there is one such attribute only,
149      * and it is more desirable to convert
150      * element_name: { attribute: value}
151      * to
152      * element_name: value
153      * With a solid example,
154      * <likelySubtag from="zh" to="zh_Hans_CN" />
155      * distinguishing attr "from" will become the key, its better to
156      * omit "to" and have this simple mapping:
157      * "zh" : "zh_Hans_CN",
158      */
159     static final ImmutableSet<String> COMPACTABLE_ATTR_AS_VALUE_SET = ImmutableSet.of(
160         // common/main
161         "calendars:default:choice",
162         "dateFormats:default:choice",
163         "months:default:choice",
164         "monthContext:default:choice",
165         "days:default:choice",
166         "dayContext:default:choice",
167         "timeFormats:default:choice",
168         "dateTimeFormats:default:choice",
169         "timeZoneNames:singleCountries:list",
170 
171         //rbnf
172         "ruleset:rbnfrule:value",
173         // common/supplemental
174         "likelySubtags:likelySubtag:to",
175         //"territoryContainment:group:type",
176         "calendar:calendarSystem:type",
177         "calendarPreferenceData:calendarPreference:ordering",
178         "codesByTerritory:telephoneCountryCode:code",
179 
180         // common/collation
181         "collations:default:choice",
182 
183         // common/supplemental/pluralRanges.xml
184         "pluralRanges:pluralRange:result",
185 
186         // identity elements
187         "identity:language:type",
188         "identity:script:type",
189         "identity:territory:type",
190         "identity:variant:type");
191 
192     /**
193      * The set of attributes that should be treated as value, and reduce to
194      * simple value only form.
195      */
196 
197     /**
198      * Anonymous key name.
199      */
200     public static final String ANONYMOUS_KEY = "_";
201 
202     /**
203      * Check if the attribute should be suppressed.
204      *
205      * Right now only "_q" is suppressed. In most cases array is used and there
206      * is no need for this information. In other cases, order is irrelevant.
207      *
208      * @return True if the attribute should be suppressed.
209      */
IsSuppresedAttr(String attr)210     public static boolean IsSuppresedAttr(String attr) {
211         return attr.endsWith("_q") || attr.endsWith("-q");
212     }
213 
214     /**
215      * The set of attributes that should be ignored in the conversion process.
216      */
217     public static final ImmutableSet<String> IGNORABLE_NONDISTINGUISHING_ATTR_SET = ImmutableSet.of("draft", "references");
218 
219     /**
220      * List of attributes that should be suppressed.
221      * This list comes form cldr/common/supplemental/supplementalMetadata. Each
222      * three of them is a group, they are for element, value and attribute.
223      * If the specified attribute appears in specified element with specified =
224      * value, it should be suppressed.
225      */
226     public static final String[] ATTR_SUPPRESS_LIST = {
227         // common/main
228         "dateFormat", "standard", "type",
229         "dateTimeFormat", "standard", "type",
230         "timeFormat", "standard", "type",
231         "decimalFormat", "standard", "type",
232         "percentFormat", "standard", "type",
233         "scientificFormat", "standard", "type",
234         "pattern", "standard", "type",
235     };
236 
237     /**
238      * This is a simple class to hold the splittable attribute specification.
239      */
240     public static class SplittableAttributeSpec {
241         public String element;
242         public String attribute;
243         public String attrAsValueAfterSplit;
244 
SplittableAttributeSpec(String el, String attr, String av)245         SplittableAttributeSpec(String el, String attr, String av) {
246             element = el;
247             attribute = attr;
248             attrAsValueAfterSplit = av;
249         }
250     }
251 
252     /**
253      * List of attributes that has value that can be split. Each two of them is a
254      * group, and represent element and value. Occurrences of such match should
255      * lead to creation of multiple node.
256      * Example:
257      * <weekendStart day="thu" territories="DZ KW OM SA SD YE AF IR"/>
258      * should be treated as if following node is encountered.
259      * <weekendStart day="thu" territories="DZ"/>
260      * <weekendStart day="thu" territories="KW"/>
261      * <weekendStart day="thu" territories="OM"/>
262      * <weekendStart day="thu" territories="SA"/>
263      * <weekendStart day="thu" territories="SD"/>
264      * <weekendStart day="thu" territories="YE"/>
265      * <weekendStart day="thu" territories="AF"/>
266      * <weekendStart day="thu" territories="IR"/>
267      */
268     public static final SplittableAttributeSpec[] SPLITTABLE_ATTRS = {
269         new SplittableAttributeSpec("calendarPreference", "territories", null),
270         new SplittableAttributeSpec("pluralRanges", "locales", null),
271         new SplittableAttributeSpec("pluralRules", "locales", null),
272         new SplittableAttributeSpec("minDays", "territories", "count"),
273         new SplittableAttributeSpec("firstDay", "territories", "day"),
274         new SplittableAttributeSpec("weekendStart", "territories", "day"),
275         new SplittableAttributeSpec("weekendEnd", "territories", "day"),
276         new SplittableAttributeSpec("measurementSystem", "territories", "type"),
277         new SplittableAttributeSpec("measurementSystem-category-temperature", "territories", "type"),
278         new SplittableAttributeSpec("paperSize", "territories", "type"),
279         new SplittableAttributeSpec("parentLocale", "locales", "parent"),
280         new SplittableAttributeSpec("hours", "regions", null),
281         new SplittableAttributeSpec("dayPeriodRules", "locales", null),
282         // new SplittableAttributeSpec("group", "contains", "group"),
283         new SplittableAttributeSpec("personList", "locales", "type"),
284         new SplittableAttributeSpec("unitPreference", "regions", null)
285     };
286 
287     /**
288      * The set that contains all timezone type of elements.
289      */
290     public static final Set<String> TIMEZONE_ELEMENT_NAME_SET = Builder.with(new HashSet<String>())
291         .add("zone").add("timezone")
292         .add("zoneItem").add("typeMap").freeze();
293 
294     /**
295      * There are a handful of attribute values that are more properly represented as an array of strings rather than
296      * as a single string.
297      */
298     public static final Set<String> ATTRVALUE_AS_ARRAY_SET = Builder.with(new HashSet<String>())
299         .add("territories").add("scripts").add("contains").freeze();
300 
301     /**
302      * Following is the list of elements that need to be sorted before output.
303      *
304      * Time zone item is split to multiple level, and each level should be
305      * grouped together. The locale list in "dayPeriodRule" could be split to
306      * multiple items, and items for each locale should be grouped together.
307      */
308     public static final String[] ELEMENT_NEED_SORT = {
309         "zone", "timezone", "zoneItem", "typeMap", "dayPeriodRule", "pluralRanges",
310         "pluralRules", "personList", "calendarPreferenceData", "character-fallback", "types", "timeData", "minDays",
311         "firstDay", "weekendStart", "weekendEnd", "measurementData", "measurementSystem"
312     };
313 
314     /**
315      * Some elements in CLDR has multiple children of the same type of element.
316      * We would like to treat them as array.
317      */
318     public static final Pattern ARRAY_ITEM_PATTERN = PatternCache.get(
319         "(.*/collation[^/]*/rules[^/]*/" +
320             "|.*/character-fallback[^/]*/character[^/]*/" +
321             "|.*/rbnfrule[^/]*/" +
322             "|.*/ruleset[^/]*/" +
323             "|.*/languageMatching[^/]*/languageMatches[^/]*/" +
324             "|.*/windowsZones[^/]*/mapTimezones[^/]*/" +
325             "|.*/metaZones[^/]*/mapTimezones[^/]*/" +
326             "|.*/segmentation[^/]*/variables[^/]*/" +
327             "|.*/segmentation[^/]*/suppressions[^/]*/" +
328             "|.*/transform[^/]*/tRules[^/]*/" +
329             "|.*/region/region[^/]*/" +
330             "|.*/keyword[^/]*/key[^/]*/" +
331             "|.*/telephoneCodeData[^/]*/codesByTerritory[^/]*/" +
332             "|.*/metazoneInfo[^/]*/timezone\\[[^\\]]*\\]/" +
333             "|.*/metadata[^/]*/validity[^/]*/" +
334             "|.*/metadata[^/]*/suppress[^/]*/" +
335             "|.*/metadata[^/]*/deprecated[^/]*/" +
336             ")(.*)");
337 
338     /**
339      * Number elements without a numbering system are there only for compatibility purposes.
340      * We automatically suppress generation of JSON objects for them.
341      */
342     public static final Pattern NO_NUMBERING_SYSTEM_PATTERN = Pattern
343         .compile("//ldml/numbers/(symbols|(decimal|percent|scientific|currency)Formats)/.*");
344     public static final Pattern NUMBERING_SYSTEM_PATTERN = Pattern
345         .compile("//ldml/numbers/(symbols|miscPatterns|(decimal|percent|scientific|currency)Formats)\\[@numberSystem=\"([^\"]++)\"\\]/.*");
346     public static final String[] ACTIVE_NUMBERING_SYSTEM_XPATHS = {
347         "//ldml/numbers/defaultNumberingSystem",
348         "//ldml/numbers/otherNumberingSystems/native",
349         "//ldml/numbers/otherNumberingSystems/traditional",
350         "//ldml/numbers/otherNumberingSystems/finance"
351     };
352 
353     /**
354      * Root language id pattern should be discarded in all locales except root,
355      * even though the path will exist in a resolved CLDRFile.
356      */
357     public static final Pattern ROOT_IDENTITY_PATTERN = Pattern
358         .compile("//ldml/identity/language\\[@type=\"root\"\\]");
359 
360     /**
361      * A simple class to hold the specification of a path transformation.
362      */
363     public static class PathTransformSpec {
364         public Pattern pattern;
365         public String replacement;
366 
PathTransformSpec(String patternStr, String replacement)367         PathTransformSpec(String patternStr, String replacement) {
368             pattern = PatternCache.get(patternStr);
369             this.replacement = replacement;
370         }
371     }
372 
373     /**
374      * Some special transformation, like add an additional layer, can be easily
375      * done by transforming the path. Following rules covers these kind of
376      * transformation.
377      * Note: It is important to keep the order for these rules. Whenever a
378      * rule matches, further rules won't be applied.
379      */
380     public static final PathTransformSpec PATH_TRANSFORMATIONS[] = {
381         // Add "standard" as type attribute to exemplarCharacter element if there
382         // is none, and separate them to two layers.
383         new PathTransformSpec(
384             "(.*ldml/exemplarCharacters)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
385         new PathTransformSpec("(.*ldml/exemplarCharacters)(.*)$", "$1/standard$2"),
386 
387         // Add cldrVersion attribute
388         new PathTransformSpec("(.+)/identity/version\\[@number=\"([^\"]*)\"\\]", "$1" + "/identity/version\\[@cldrVersion=\""
389             + CLDRFile.GEN_VERSION + "\"\\]"),
390         // Add cldrVersion attribute to supplemental data
391         new PathTransformSpec("(.+)/version\\[@number=\"([^\"]*)\"\\]\\[@unicodeVersion=\"([^\"]*\")(\\])", "$1" + "/version\\[@cldrVersion=\""
392             + CLDRFile.GEN_VERSION + "\"\\]" + "\\[@unicodeVersion=\"" + "$3" + "\\]"),
393 
394         // Transform underscore to hyphen-minus in language keys
395         new PathTransformSpec("(.*/language\\[@type=\"[a-z]{2,3})_([^\"]*\"\\](\\[@alt=\"short\"])?)", "$1-$2"),
396 
397         // Separate "ellipsis" from its type as another layer.
398         new PathTransformSpec("(.*/ellipsis)\\[@type=\"([^\"]*)\"\\](.*)$",
399             "$1/$2$3"),
400 
401         // Remove unnecessary dateFormat/pattern
402         new PathTransformSpec(
403             "(.*/calendars)/calendar\\[@type=\"([^\"]*)\"\\](.*)Length\\[@type=\"([^\"]*)\"\\]/(date|time|dateTime)Format\\[@type=\"([^\"]*)\"\\]/pattern\\[@type=\"([^\"]*)\"\\](.*)",
404             "$1/$2/$5Formats/$4$8"),
405 
406         // Separate calendar type
407         new PathTransformSpec("(.*/calendars)/calendar\\[@type=\"([^\"]*)\"\\](.*)$",
408             "$1/$2$3"),
409 
410         // Separate "metazone" from its type as another layer.
411         new PathTransformSpec("(.*/metazone)\\[@type=\"([^\"]*)\"\\]/(.*)$", "$1/$2/$3"),
412 
413         // Split out types into its various fields
414         new PathTransformSpec("(.*)/types/type\\[@key=\"([^\"]*)\"\\]\\[@type=\"([^\"]*)\"\\](.*)$",
415             "$1/types/$2/$3$4"),
416 
417         // Typographic
418         new PathTransformSpec("(.*)/(typographicNames)/(axisName|featureName)\\[@type=\"([^\"]*)\"\\](.*)$",
419             "$1/$2/$3s/$4$5"),
420         new PathTransformSpec("(.*)/(typographicNames)/(styleName)(.*)$",
421             "$1/$2/$3s/$3$4"),
422 
423         // put CharacterLabelPatterns under CharacterLabelPatterns
424         new PathTransformSpec("(.*)/(characterLabels)/(characterLabelPattern)(.*)$",
425             "$1/characterLabelPatterns/$3$4"),
426 
427         new PathTransformSpec(
428             "(.*/numbers/(decimal|scientific|percent|currency)Formats\\[@numberSystem=\"([^\"]*)\"\\])/(decimal|scientific|percent|currency)FormatLength/(decimal|scientific|percent|currency)Format\\[@type=\"standard\"]/pattern.*$",
429             "$1/standard"),
430 
431         new PathTransformSpec(
432             "(.*/numbers/currencyFormats\\[@numberSystem=\"([^\"]*)\"\\])/currencyFormatLength/currencyFormat\\[@type=\"accounting\"]/pattern.*$",
433             "$1/accounting"),
434         // Add "type" attribute with value "standard" if there is no "type" in
435         // "decimalFormatLength".
436         new PathTransformSpec(
437             "(.*/numbers/(decimal|scientific|percent)Formats\\[@numberSystem=\"([^\"]*)\"\\]/(decimal|scientific|percent)FormatLength)/(.*)$",
438             "$1[@type=\"standard\"]/$5"),
439 
440         new PathTransformSpec(
441             "(.*/listPattern)/(.*)$", "$1[@type=\"standard\"]/$2"),
442 
443         new PathTransformSpec("(.*/languagePopulation)\\[@type=\"([^\"]*)\"\\](.*)",
444             "$1/$2$3"),
445 
446         new PathTransformSpec("(.*/languageAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
447         new PathTransformSpec("(.*/scriptAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
448         new PathTransformSpec("(.*/territoryAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
449         new PathTransformSpec("(.*/subdivisionAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
450         new PathTransformSpec("(.*/variantAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
451         new PathTransformSpec("(.*/zoneAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
452         new PathTransformSpec("(.*/alias)(.*)", "$1/alias$2"),
453 
454         new PathTransformSpec("(.*currencyData/region)(.*)", "$1/region$2"),
455 
456         // Skip exemplar city in /etc/GMT or UTC timezones, since they don't have them.
457         new PathTransformSpec("(.*(GMT|UTC).*/exemplarCity)(.*)", ""),
458 
459         new PathTransformSpec("(.*/transforms/transform[^/]*)/(.*)", "$1/tRules/$2"),
460         new PathTransformSpec("(.*)\\[@territories=\"([^\"]*)\"\\](.*)\\[@alt=\"variant\"\\](.*)", "$1\\[@territories=\"$2-alt-variant\"\\]"),
461         new PathTransformSpec("(.*)/weekData/(.*)\\[@alt=\"variant\"\\](.*)", "$1/weekData/$2$3"),
462         new PathTransformSpec("(.*)/unitPreferenceData/unitPreferences\\[@category=\"([^\"]*)\"\\]\\[@usage=\"([^\"]*)\"\\](.*)",
463             "$1/unitPreferenceData/unitPreferences/$2/$3$4"),
464 
465         // Annotations
466         // If there is a type, move that into a sibling value
467         new PathTransformSpec("(.*)/(annotations)/(annotation)\\[@cp=\"([^\"]*)\"\\]\\[@type=\"([^\"]*)\"\\](.*)$",
468                                 "$1/$2/$4/$5$6"),
469         new PathTransformSpec("(.*)/(annotations)/(annotation)\\[@cp=\"([^\"]*)\"\\](.*)$",
470                                 "$1/$2/$4/default$5"),
471     };
472 }
473