package org.unicode.cldr.json;

import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

import org.unicode.cldr.util.Builder;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.PatternCache;

import com.google.common.collect.ImmutableSet;

/**
 * Static rule tables (attribute sets, regular expressions and path rewrites)
 * consulted when CLDR LDML data is converted to JSON.
 */
class LdmlConvertRules {

    /** File sets that will not be processed in JSON transformation. */
    public static final ImmutableSet<String> IGNORE_FILE_SET = ImmutableSet.of(
        "attributeValueValidity",
        "coverageLevels",
        "postalCodeData",
        "pluralRanges",
        "subdivisions");

    /**
     * The set of attributes that should become part of the name in form of
     * name-(attribute)-(value).
     * Each entry is written as [parent_element]:[element]:[attribute].
     */
    static final ImmutableSet<String> NAME_PART_DISTINGUISHING_ATTR_SET = ImmutableSet.of(
        // common/main
        "monthWidth:month:yeartype",
        "characters:parseLenients:scope",
        "dateFormat:pattern:numbers",
        "currencyFormats:unitPattern:count",
        "currency:displayName:count",
        "numbers:symbols:numberSystem",
        "numbers:decimalFormats:numberSystem",
        "numbers:currencyFormats:numberSystem",
        "numbers:percentFormats:numberSystem",
        "numbers:scientificFormats:numberSystem",
        "numbers:miscPatterns:numberSystem",
        "minimalPairs:pluralMinimalPairs:count",
        "territoryContainment:group:status",
        "decimalFormat:pattern:count",
        "currencyFormat:pattern:count",
        "unit:unitPattern:count",
        "field:relative:type",
        "field:relativeTime:type",
        "relativeTime:relativeTimePattern:count",
        "availableFormats:dateFormatItem:count",
        "listPatterns:listPattern:type",
        "timeZoneNames:regionFormat:type",
        "units:durationUnit:type",
        "weekData:minDays:territories",
        "weekData:firstDay:territories",
        "weekData:weekendStart:territories",
        "weekData:weekendEnd:territories",
        "unitPreferenceDataData:unitPreferences:category",
        "measurementData:measurementSystem:category",
        "supplemental:plurals:type",
        "pluralRules:pluralRule:count",
        "languageMatches:languageMatch:desired");

    /**
     * The set of [parent_element]:[element]:[attribute] entries in which the
     * attribute should be treated as a value. All the attributes here are
     * non-distinguishing attributes. For example,
     * <pre>{@code
     * <type type="arab" key="numbers">Arabic-Indic Digits</type>
     * }</pre>
     * should really be converted as
     * <pre>{@code
     * "arab": {
     *     "_value": "Arabic-Indic Digits",
     *     "_key": "numbers"
     * }
     * }</pre>
     */
    static final ImmutableSet<String> ATTR_AS_VALUE_SET = ImmutableSet.of(

        // in common/supplemental/dayPeriods.xml
        "dayPeriodRules:dayPeriodRule:from",

        // in common/supplemental/likelySubtags.xml
        "likelySubtags:likelySubtag:to",

        // in common/supplemental/metaZones.xml
        "timezone:usesMetazone:mzone",
        // Only the current usesMetazone will be kept, it is not necessary to keep
        // "to" and "from" attributes to make key unique. This is needed as their
        // value is not good if used as key.
        "timezone:usesMetazone:to",
        "timezone:usesMetazone:from",

        // "mapTimezones:mapZone:other" also covers common/supplemental/windowsZones.xml;
        // it used to be listed a second time under that heading. ImmutableSet.of
        // drops duplicates, so a single entry yields the same set.
        "mapTimezones:mapZone:other",
        "mapTimezones:mapZone:type",
        "mapTimezones:mapZone:territory",

        // in common/supplemental/numberingSystems.xml
        "numberingSystems:numberingSystem:type",

        // in common/supplemental/supplementalData.xml
        "region:currency:from",
        "region:currency:to",
        "region:currency:tender",
        "calendar:calendarSystem:type",
        "codeMappings:territoryCodes:numeric",
        "codeMappings:territoryCodes:alpha3",
        "codeMappings:currencyCodes:numeric",
        "timeData:hours:allowed",
        "timeData:hours:preferred",

        // in common/supplemental/supplementalMetaData.xml
        "validity:variable:type",
        "deprecated:deprecatedItems:elements",
        "deprecated:deprecatedItems:attributes",
        "deprecated:deprecatedItems:type",

        // in common/supplemental/telephoneCodeData.xml
        "codesByTerritory:telephoneCountryCode:code",

        // in common/bcp47/*.xml
        "keyword:key:alias",
        "key:type:alias",
        "key:type:name",

        // identity elements
        "identity:language:type",
        "identity:script:type",
        "identity:territory:type",
        "identity:variant:type");

    /**
     * The set of attributes that should be treated as value, and reduced to
     * simple value only form.
     *
     * <p>Attributes that are treated as values take the form of
     * {@code element_name: { ..., attribute: value, ...}}.
     * This is desirable as an element may have several attributes that are
     * treated as values. But in some cases, there is one such attribute only,
     * and it is more desirable to convert
     * {@code element_name: { attribute: value}}
     * to
     * {@code element_name: value}.
     *
     * <p>With a solid example,
     * {@code <likelySubtag from="zh" to="zh_Hans_CN" />}:
     * distinguishing attr "from" will become the key, it's better to
     * omit "to" and have this simple mapping:
     * {@code "zh" : "zh_Hans_CN",}
     */
    static final ImmutableSet<String> COMPACTABLE_ATTR_AS_VALUE_SET = ImmutableSet.of(
        // common/main
        "calendars:default:choice",
        "dateFormats:default:choice",
        "months:default:choice",
        "monthContext:default:choice",
        "days:default:choice",
        "dayContext:default:choice",
        "timeFormats:default:choice",
        "dateTimeFormats:default:choice",
        "timeZoneNames:singleCountries:list",

        // rbnf
        "ruleset:rbnfrule:value",

        // common/supplemental
        "likelySubtags:likelySubtag:to",
        // (disabled) "territoryContainment:group:type",
        "calendar:calendarSystem:type",
        "calendarPreferenceData:calendarPreference:ordering",
        "codesByTerritory:telephoneCountryCode:code",

        // common/collation
        "collations:default:choice",

        // identity elements
        "identity:language:type",
        "identity:script:type",
        "identity:territory:type",
        "identity:variant:type");

    /**
     * Anonymous key name.
     */
    public static final String ANONYMOUS_KEY = "_";

    /**
     * Check if the attribute should be suppressed.
     *
     * Right now only attribute names ending in "_q" or "-q" are suppressed.
     * In most cases array is used and there is no need for this information.
     * In other cases, order is irrelevant.
     *
     * @param attr the attribute name to test; must not be null
     * @return True if the attribute should be suppressed.
     */
    public static boolean IsSuppresedAttr(String attr) {
        return attr.endsWith("_q") || attr.endsWith("-q");
    }

    /**
     * The set of attributes that should be ignored in the conversion process.
     */
    public static final ImmutableSet<String> IGNORABLE_NONDISTINGUISHING_ATTR_SET = ImmutableSet.of(
        "draft",
        "references");

    /**
     * List of attributes that should be suppressed.
     * This list comes from cldr/common/supplemental/supplementalMetadata. Every
     * three consecutive entries form a group: element, value and attribute.
     * If the specified attribute appears in the specified element with the
     * specified value, it should be suppressed.
     */
    public static final String[] ATTR_SUPPRESS_LIST = {
        // common/main
        "dateFormat", "standard", "type",
        "dateTimeFormat", "standard", "type",
        "timeFormat", "standard", "type",
        "decimalFormat", "standard", "type",
        "percentFormat", "standard", "type",
        "scientificFormat", "standard", "type",
        "pattern", "standard", "type",
    };

    /**
     * This is a simple class to hold the splittable attribute specification.
     */
    public static class SplittableAttributeSpec {
        /** Name of the element the specification applies to. */
        public String element;
        /** Name of the attribute whose value can be split. */
        public String attribute;
        /**
         * Third component of the specification; may be null.
         * NOTE(review): presumably the attribute that becomes the value of
         * each node produced by the split - confirm against the consumer.
         */
        public String attrAsValueAfterSplit;

        SplittableAttributeSpec(String el, String attr, String av) {
            element = el;
            attribute = attr;
            attrAsValueAfterSplit = av;
        }
    }

    /**
     * List of attributes that have a value that can be split. Each entry names
     * an element, the splittable attribute, and an optional (possibly null)
     * {@code attrAsValueAfterSplit}. Occurrences of such a match should
     * lead to creation of multiple nodes.
     * Example:
     * <pre>{@code
     * <weekendStart day="thu" territories="DZ KW OM SA SD YE AF IR"/>
     * }</pre>
     * should be treated as if the following nodes were encountered.
     * <pre>{@code
     * <weekendStart day="thu" territories="DZ"/>
     * <weekendStart day="thu" territories="KW"/>
     * <weekendStart day="thu" territories="OM"/>
     * <weekendStart day="thu" territories="SA"/>
     * <weekendStart day="thu" territories="SD"/>
     * <weekendStart day="thu" territories="YE"/>
     * <weekendStart day="thu" territories="AF"/>
     * <weekendStart day="thu" territories="IR"/>
     * }</pre>
     */
    public static final SplittableAttributeSpec[] SPLITTABLE_ATTRS = {
        new SplittableAttributeSpec("calendarPreference", "territories", null),
        new SplittableAttributeSpec("pluralRules", "locales", null),
        new SplittableAttributeSpec("minDays", "territories", "count"),
        new SplittableAttributeSpec("firstDay", "territories", "day"),
        new SplittableAttributeSpec("weekendStart", "territories", "day"),
        new SplittableAttributeSpec("weekendEnd", "territories", "day"),
        new SplittableAttributeSpec("measurementSystem", "territories", "type"),
        new SplittableAttributeSpec("measurementSystem-category-temperature", "territories", "type"),
        new SplittableAttributeSpec("paperSize", "territories", "type"),
        new SplittableAttributeSpec("parentLocale", "locales", "parent"),
        new SplittableAttributeSpec("hours", "regions", null),
        new SplittableAttributeSpec("dayPeriodRules", "locales", null),
        // (disabled) new SplittableAttributeSpec("group", "contains", "group"),
        new SplittableAttributeSpec("personList", "locales", "type"),
        new SplittableAttributeSpec("unitPreference", "regions", null)
    };

    /**
     * The set that contains all timezone type of elements.
     */
    public static final Set<String> TIMEZONE_ELEMENT_NAME_SET = Builder.with(new HashSet<String>())
        .add("zone")
        .add("timezone")
        .add("zoneItem")
        .add("typeMap")
        .freeze();

    /**
     * There are a handful of attribute values that are more properly represented as an array of strings rather than
     * as a single string.
     */
    public static final Set<String> ATTRVALUE_AS_ARRAY_SET = Builder.with(new HashSet<String>())
        .add("territories")
        .add("scripts")
        .add("contains")
        .freeze();

    /**
     * Following is the list of elements that need to be sorted before output.
     *
     * Time zone item is split to multiple level, and each level should be
     * grouped together. The locale list in "dayPeriodRule" could be split to
     * multiple items, and items for each locale should be grouped together.
     */
    public static final String[] ELEMENT_NEED_SORT = {
        "zone", "timezone", "zoneItem", "typeMap", "dayPeriodRule",
        "pluralRules", "personList", "calendarPreferenceData", "character-fallback", "types", "timeData", "minDays",
        "firstDay", "weekendStart", "weekendEnd", "measurementData", "measurementSystem"
    };

    /**
     * Some elements in CLDR have multiple children of the same type of element.
     * We would like to treat them as array.
     */
    public static final Pattern ARRAY_ITEM_PATTERN = PatternCache.get(
        "(.*/collation[^/]*/rules[^/]*/" +
            "|.*/character-fallback[^/]*/character[^/]*/" +
            "|.*/rbnfrule[^/]*/" +
            "|.*/ruleset[^/]*/" +
            "|.*/languageMatching[^/]*/languageMatches[^/]*/" +
            "|.*/windowsZones[^/]*/mapTimezones[^/]*/" +
            "|.*/metaZones[^/]*/mapTimezones[^/]*/" +
            "|.*/segmentation[^/]*/variables[^/]*/" +
            "|.*/segmentation[^/]*/suppressions[^/]*/" +
            "|.*/transform[^/]*/tRules[^/]*/" +
            "|.*/region/region[^/]*/" +
            "|.*/keyword[^/]*/key[^/]*/" +
            "|.*/telephoneCodeData[^/]*/codesByTerritory[^/]*/" +
            "|.*/metazoneInfo[^/]*/timezone\\[[^\\]]*\\]/" +
            "|.*/metadata[^/]*/validity[^/]*/" +
            "|.*/metadata[^/]*/suppress[^/]*/" +
            "|.*/metadata[^/]*/deprecated[^/]*/" +
            ")(.*)");

    /**
     * Number elements without a numbering system are there only for compatibility purposes.
     * We automatically suppress generation of JSON objects for them.
     */
    public static final Pattern NO_NUMBERING_SYSTEM_PATTERN = Pattern
        .compile("//ldml/numbers/(symbols|(decimal|percent|scientific|currency)Formats)/.*");

    /** Matches number element paths that carry an explicit {@code numberSystem} attribute. */
    public static final Pattern NUMBERING_SYSTEM_PATTERN = Pattern
        .compile("//ldml/numbers/(symbols|miscPatterns|(decimal|percent|scientific|currency)Formats)\\[@numberSystem=\"([^\"]++)\"\\]/.*");

    /** XPaths of the default and the "other" (native, traditional, finance) numbering system elements. */
    public static final String[] ACTIVE_NUMBERING_SYSTEM_XPATHS = {
        "//ldml/numbers/defaultNumberingSystem",
        "//ldml/numbers/otherNumberingSystems/native",
        "//ldml/numbers/otherNumberingSystems/traditional",
        "//ldml/numbers/otherNumberingSystems/finance"
    };

    /**
     * Root language id pattern should be discarded in all locales except root,
     * even though the path will exist in a resolved CLDRFile.
     */
    public static final Pattern ROOT_IDENTITY_PATTERN = Pattern
        .compile("//ldml/identity/language\\[@type=\"root\"\\]");

    /**
     * A simple class to hold the specification of a path transformation.
     */
    public static class PathTransformSpec {
        /** Compiled form of the pattern string passed to the constructor. */
        public Pattern pattern;
        /** Replacement string stored as given. */
        public String replacement;

        PathTransformSpec(String patternStr, String replacement) {
            pattern = PatternCache.get(patternStr);
            this.replacement = replacement;
        }
    }

    /**
     * Some special transformation, like add an additional layer, can be easily
     * done by transforming the path. Following rules covers these kind of
     * transformation.
     * Note: It is important to keep the order for these rules. Whenever a
     * rule matches, further rule won't be applied.
     */
    public static final PathTransformSpec[] PATH_TRANSFORMATIONS = {
        // Add "standard" as type attribute to exemplarCharacter element if there
        // is none, and separate them to two layers.
        new PathTransformSpec(
            "(.*ldml/exemplarCharacters)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
        new PathTransformSpec("(.*ldml/exemplarCharacters)(.*)$", "$1/standard$2"),

        // Add cldrVersion attribute
        new PathTransformSpec("(.*/identity/version\\[@number=\"([^\"]*)\")(\\])", "$1" + "\\]\\[@cldrVersion=\""
            + CLDRFile.GEN_VERSION + "\"\\]"),
        // Add cldrVersion attribute to supplemental data
        new PathTransformSpec("(.*/version\\[@number=\"([^\"]*)\")(\\])\\[@unicodeVersion=\"([^\"]*\")(\\])", "$1" + "\\]\\[@cldrVersion=\""
            + CLDRFile.GEN_VERSION + "\"\\]" + "\\[@unicodeVersion=\"" + "$4" + "\\]"),

        // Transform underscore to hyphen-minus in language keys
        new PathTransformSpec("(.*/language\\[@type=\"[a-z]{2,3})_([^\"]*\"\\](\\[@alt=\"short\"])?)", "$1-$2"),

        // Separate "ellipsis" from its type as another layer.
        new PathTransformSpec("(.*/ellipsis)\\[@type=\"([^\"]*)\"\\](.*)$",
            "$1/$2$3"),

        // Remove unnecessary dateFormat/pattern
        new PathTransformSpec(
            "(.*/calendars)/calendar\\[@type=\"([^\"]*)\"\\](.*)Length\\[@type=\"([^\"]*)\"\\]/(date|time|dateTime)Format\\[@type=\"([^\"]*)\"\\]/pattern\\[@type=\"([^\"]*)\"\\](.*)",
            "$1/$2/$5Formats/$4$8"),

        // Separate calendar type
        new PathTransformSpec("(.*/calendars)/calendar\\[@type=\"([^\"]*)\"\\](.*)$",
            "$1/$2$3"),

        // Separate "metazone" from its type as another layer.
        new PathTransformSpec("(.*/metazone)\\[@type=\"([^\"]*)\"\\]/(.*)$", "$1/$2/$3"),

        // Split out types into its various fields
        new PathTransformSpec("(.*)/types/type\\[@key=\"([^\"]*)\"\\]\\[@type=\"([^\"]*)\"\\](.*)$",
            "$1/types/$2/$3$4"),

        new PathTransformSpec(
            "(.*/numbers/(decimal|scientific|percent|currency)Formats\\[@numberSystem=\"([^\"]*)\"\\])/(decimal|scientific|percent|currency)FormatLength/(decimal|scientific|percent|currency)Format\\[@type=\"standard\"]/pattern.*$",
            "$1/standard"),

        new PathTransformSpec(
            "(.*/numbers/currencyFormats\\[@numberSystem=\"([^\"]*)\"\\])/currencyFormatLength/currencyFormat\\[@type=\"accounting\"]/pattern.*$",
            "$1/accounting"),
        // Add "type" attribute with value "standard" if there is no "type" in
        // "decimalFormatLength".
        new PathTransformSpec(
            "(.*/numbers/(decimal|scientific|percent)Formats\\[@numberSystem=\"([^\"]*)\"\\]/(decimal|scientific|percent)FormatLength)/(.*)$",
            "$1[@type=\"standard\"]/$5"),

        new PathTransformSpec(
            "(.*/listPattern)/(.*)$", "$1[@type=\"standard\"]/$2"),

        new PathTransformSpec("(.*/languagePopulation)\\[@type=\"([^\"]*)\"\\](.*)",
            "$1/$2$3"),

        new PathTransformSpec("(.*/languageAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
        new PathTransformSpec("(.*/scriptAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
        new PathTransformSpec("(.*/territoryAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
        new PathTransformSpec("(.*/variantAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
        new PathTransformSpec("(.*/zoneAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"),
        new PathTransformSpec("(.*/alias)(.*)", "$1/alias$2"),

        new PathTransformSpec("(.*currencyData/region)(.*)", "$1/region$2"),

        // Skip exemplar city in /etc/GMT or UTC timezones, since they don't have them.
        new PathTransformSpec("(.*(GMT|UTC).*/exemplarCity)(.*)", ""),

        new PathTransformSpec("(.*/transforms/transform[^/]*)/(.*)", "$1/tRules/$2"),
        new PathTransformSpec("(.*)\\[@territories=\"([^\"]*)\"\\](.*)\\[@alt=\"variant\"\\](.*)", "$1\\[@territories=\"$2-alt-variant\"\\]"),
        new PathTransformSpec("(.*)/weekData/(.*)\\[@alt=\"variant\"\\](.*)", "$1/weekData/$2$3"),
        new PathTransformSpec("(.*)/unitPreferenceData/unitPreferences\\[@category=\"([^\"]*)\"\\]\\[@usage=\"([^\"]*)\"\\](.*)",
            "$1/unitPreferenceData/unitPreferences/$2/$3$4"),

    };
}