1 package org.unicode.cldr.json; 2 3 import java.util.HashSet; 4 import java.util.Set; 5 import java.util.regex.Pattern; 6 7 import org.unicode.cldr.util.Builder; 8 import org.unicode.cldr.util.CLDRFile; 9 import org.unicode.cldr.util.PatternCache; 10 11 import com.google.common.collect.ImmutableSet; 12 13 class LdmlConvertRules { 14 15 /** File sets that will not be processed in JSON transformation. */ 16 public static final ImmutableSet<String> IGNORE_FILE_SET = ImmutableSet.of("attributeValueValidity", "coverageLevels", "grammaticalFeatures", "postalCodeData", 17 "subdivisions", "units"); 18 19 /** 20 * The attribute list that should become part of the name in form of 21 * name-(attribute)-(value). 22 * [parent_element]:[element]:[attribute] 23 */ 24 // common/main 25 static final ImmutableSet<String> NAME_PART_DISTINGUISHING_ATTR_SET = ImmutableSet.of( 26 "monthWidth:month:yeartype", 27 "characters:parseLenients:scope", 28 "dateFormat:pattern:numbers", 29 "characterLabelPatterns:characterLabelPattern:count", // originally under characterLabels 30 "currencyFormats:unitPattern:count", 31 "currency:displayName:count", 32 "numbers:symbols:numberSystem", 33 "numbers:decimalFormats:numberSystem", 34 "numbers:currencyFormats:numberSystem", 35 "numbers:percentFormats:numberSystem", 36 "numbers:scientificFormats:numberSystem", 37 "numbers:miscPatterns:numberSystem", 38 "minimalPairs:pluralMinimalPairs:count", 39 "territoryContainment:group:status", 40 "decimalFormat:pattern:count", 41 "currencyFormat:pattern:count", 42 "unit:unitPattern:count", 43 // compound units 44 "compoundUnit:compoundUnitPattern1:count", 45 "compoundUnit:compoundUnitPattern1:gender", 46 "compoundUnit:compoundUnitPattern1:case", 47 "field:relative:type", 48 "field:relativeTime:type", 49 "relativeTime:relativeTimePattern:count", 50 "availableFormats:dateFormatItem:count", 51 "listPatterns:listPattern:type", 52 "timeZoneNames:regionFormat:type", 53 "units:durationUnit:type", 54 "weekData:minDays:territories", 55 "weekData:firstDay:territories", 56 "weekData:weekendStart:territories", 57 "weekData:weekendEnd:territories", 58 "unitPreferenceDataData:unitPreferences:category", 59 "measurementData:measurementSystem:category", 60 "supplemental:plurals:type", 61 "pluralRanges:pluralRange:start", 62 "pluralRanges:pluralRange:end", 63 "pluralRules:pluralRule:count", 64 "languageMatches:languageMatch:desired", 65 "styleNames:styleName:subtype", 66 "styleNames:styleName:alt"); 67 68 /** 69 * The set of attributes that should become part of the name in form of 70 * name-(attribute)-(value). 71 */ 72 73 /** 74 * Following is a list of element:attribute pair. These attributes should be 75 * treated as values. For example, 76 * <type type="arab" key="numbers">Arabic-Indic Digits</type> 77 * should be really converted as, 78 * "arab": { 79 * "_value": "Arabic-Indic Digits", 80 * "_key": "numbers" 81 * } 82 */ 83 static final ImmutableSet<String> ATTR_AS_VALUE_SET = ImmutableSet.of( 84 85 // in common/supplemental/dayPeriods.xml 86 "dayPeriodRules:dayPeriodRule:from", 87 88 // in common/supplemental/likelySubtags.xml 89 "likelySubtags:likelySubtag:to", 90 91 // in common/supplemental/metaZones.xml 92 "timezone:usesMetazone:mzone", 93 // Only the current usesMetazone will be kept, it is not necessary to keep 94 // "to" and "from" attributes to make key unique. This is needed as their 95 // value is not good if used as key. 96 "timezone:usesMetazone:to", 97 "timezone:usesMetazone:from", 98 99 "mapTimezones:mapZone:other", 100 "mapTimezones:mapZone:type", 101 "mapTimezones:mapZone:territory", 102 103 // in common/supplemental/numberingSystems.xml 104 "numberingSystems:numberingSystem:type", 105 106 // in common/supplemental/supplementalData.xml 107 "region:currency:from", 108 "region:currency:to", 109 "region:currency:tender", 110 "calendar:calendarSystem:type", 111 "codeMappings:territoryCodes:numeric", 112 "codeMappings:territoryCodes:alpha3", 113 "codeMappings:currencyCodes:numeric", 114 "timeData:hours:allowed", 115 "timeData:hours:preferred", 116 // common/supplemental/supplementalMetaData.xml 117 "validity:variable:type", 118 "deprecated:deprecatedItems:elements", 119 "deprecated:deprecatedItems:attributes", 120 "deprecated:deprecatedItems:type", 121 122 // in common/supplemental/telephoneCodeData.xml 123 "codesByTerritory:telephoneCountryCode:code", 124 125 // in common/supplemental/windowsZones.xml 126 "mapTimezones:mapZone:other", 127 128 // in common/bcp47/*.xml 129 "keyword:key:alias", 130 "key:type:alias", 131 "key:type:name", 132 133 // identity elements 134 "identity:language:type", 135 "identity:script:type", 136 "identity:territory:type", 137 "identity:variant:type"); 138 139 /** 140 * The set of element:attribute pair in which the attribute should be 141 * treated as value. All the attribute here are non-distinguishing attributes. 142 */ 143 144 /** 145 * For those attributes that are treated as values, they taken the form of 146 * element_name: { ..., attribute: value, ...} 147 * This is desirable as an element may have several attributes that are 148 * treated as values. But in some cases, there is one such attribute only, 149 * and it is more desirable to convert 150 * element_name: { attribute: value} 151 * to 152 * element_name: value 153 * With a solid example, 154 * <likelySubtag from="zh" to="zh_Hans_CN" /> 155 * distinguishing attr "from" will become the key, its better to 156 * omit "to" and have this simple mapping: 157 * "zh" : "zh_Hans_CN", 158 */ 159 static final ImmutableSet<String> COMPACTABLE_ATTR_AS_VALUE_SET = ImmutableSet.of( 160 // common/main 161 "calendars:default:choice", 162 "dateFormats:default:choice", 163 "months:default:choice", 164 "monthContext:default:choice", 165 "days:default:choice", 166 "dayContext:default:choice", 167 "timeFormats:default:choice", 168 "dateTimeFormats:default:choice", 169 "timeZoneNames:singleCountries:list", 170 171 //rbnf 172 "ruleset:rbnfrule:value", 173 // common/supplemental 174 "likelySubtags:likelySubtag:to", 175 //"territoryContainment:group:type", 176 "calendar:calendarSystem:type", 177 "calendarPreferenceData:calendarPreference:ordering", 178 "codesByTerritory:telephoneCountryCode:code", 179 180 // common/collation 181 "collations:default:choice", 182 183 // common/supplemental/pluralRanges.xml 184 "pluralRanges:pluralRange:result", 185 186 // identity elements 187 "identity:language:type", 188 "identity:script:type", 189 "identity:territory:type", 190 "identity:variant:type"); 191 192 /** 193 * The set of attributes that should be treated as value, and reduce to 194 * simple value only form. 195 */ 196 197 /** 198 * Anonymous key name. 199 */ 200 public static final String ANONYMOUS_KEY = "_"; 201 202 /** 203 * Check if the attribute should be suppressed. 204 * 205 * Right now only "_q" is suppressed. In most cases array is used and there 206 * is no need for this information. In other cases, order is irrelevant. 207 * 208 * @return True if the attribute should be suppressed. 209 */ IsSuppresedAttr(String attr)210 public static boolean IsSuppresedAttr(String attr) { 211 return attr.endsWith("_q") || attr.endsWith("-q"); 212 } 213 214 /** 215 * The set of attributes that should be ignored in the conversion process. 216 */ 217 public static final ImmutableSet<String> IGNORABLE_NONDISTINGUISHING_ATTR_SET = ImmutableSet.of("draft", "references"); 218 219 /** 220 * List of attributes that should be suppressed. 221 * This list comes form cldr/common/supplemental/supplementalMetadata. Each 222 * three of them is a group, they are for element, value and attribute. 223 * If the specified attribute appears in specified element with specified = 224 * value, it should be suppressed. 225 */ 226 public static final String[] ATTR_SUPPRESS_LIST = { 227 // common/main 228 "dateFormat", "standard", "type", 229 "dateTimeFormat", "standard", "type", 230 "timeFormat", "standard", "type", 231 "decimalFormat", "standard", "type", 232 "percentFormat", "standard", "type", 233 "scientificFormat", "standard", "type", 234 "pattern", "standard", "type", 235 }; 236 237 /** 238 * This is a simple class to hold the splittable attribute specification. 239 */ 240 public static class SplittableAttributeSpec { 241 public String element; 242 public String attribute; 243 public String attrAsValueAfterSplit; 244 SplittableAttributeSpec(String el, String attr, String av)245 SplittableAttributeSpec(String el, String attr, String av) { 246 element = el; 247 attribute = attr; 248 attrAsValueAfterSplit = av; 249 } 250 } 251 252 /** 253 * List of attributes that has value that can be split. Each two of them is a 254 * group, and represent element and value. Occurrences of such match should 255 * lead to creation of multiple node. 256 * Example: 257 * <weekendStart day="thu" territories="DZ KW OM SA SD YE AF IR"/> 258 * should be treated as if following node is encountered. 259 * <weekendStart day="thu" territories="DZ"/> 260 * <weekendStart day="thu" territories="KW"/> 261 * <weekendStart day="thu" territories="OM"/> 262 * <weekendStart day="thu" territories="SA"/> 263 * <weekendStart day="thu" territories="SD"/> 264 * <weekendStart day="thu" territories="YE"/> 265 * <weekendStart day="thu" territories="AF"/> 266 * <weekendStart day="thu" territories="IR"/> 267 */ 268 public static final SplittableAttributeSpec[] SPLITTABLE_ATTRS = { 269 new SplittableAttributeSpec("calendarPreference", "territories", null), 270 new SplittableAttributeSpec("pluralRanges", "locales", null), 271 new SplittableAttributeSpec("pluralRules", "locales", null), 272 new SplittableAttributeSpec("minDays", "territories", "count"), 273 new SplittableAttributeSpec("firstDay", "territories", "day"), 274 new SplittableAttributeSpec("weekendStart", "territories", "day"), 275 new SplittableAttributeSpec("weekendEnd", "territories", "day"), 276 new SplittableAttributeSpec("measurementSystem", "territories", "type"), 277 new SplittableAttributeSpec("measurementSystem-category-temperature", "territories", "type"), 278 new SplittableAttributeSpec("paperSize", "territories", "type"), 279 new SplittableAttributeSpec("parentLocale", "locales", "parent"), 280 new SplittableAttributeSpec("hours", "regions", null), 281 new SplittableAttributeSpec("dayPeriodRules", "locales", null), 282 // new SplittableAttributeSpec("group", "contains", "group"), 283 new SplittableAttributeSpec("personList", "locales", "type"), 284 new SplittableAttributeSpec("unitPreference", "regions", null) 285 }; 286 287 /** 288 * The set that contains all timezone type of elements. 289 */ 290 public static final Set<String> TIMEZONE_ELEMENT_NAME_SET = Builder.with(new HashSet<String>()) 291 .add("zone").add("timezone") 292 .add("zoneItem").add("typeMap").freeze(); 293 294 /** 295 * There are a handful of attribute values that are more properly represented as an array of strings rather than 296 * as a single string. 297 */ 298 public static final Set<String> ATTRVALUE_AS_ARRAY_SET = Builder.with(new HashSet<String>()) 299 .add("territories").add("scripts").add("contains").freeze(); 300 301 /** 302 * Following is the list of elements that need to be sorted before output. 303 * 304 * Time zone item is split to multiple level, and each level should be 305 * grouped together. The locale list in "dayPeriodRule" could be split to 306 * multiple items, and items for each locale should be grouped together. 307 */ 308 public static final String[] ELEMENT_NEED_SORT = { 309 "zone", "timezone", "zoneItem", "typeMap", "dayPeriodRule", "pluralRanges", 310 "pluralRules", "personList", "calendarPreferenceData", "character-fallback", "types", "timeData", "minDays", 311 "firstDay", "weekendStart", "weekendEnd", "measurementData", "measurementSystem" 312 }; 313 314 /** 315 * Some elements in CLDR has multiple children of the same type of element. 316 * We would like to treat them as array. 317 */ 318 public static final Pattern ARRAY_ITEM_PATTERN = PatternCache.get( 319 "(.*/collation[^/]*/rules[^/]*/" + 320 "|.*/character-fallback[^/]*/character[^/]*/" + 321 "|.*/rbnfrule[^/]*/" + 322 "|.*/ruleset[^/]*/" + 323 "|.*/languageMatching[^/]*/languageMatches[^/]*/" + 324 "|.*/windowsZones[^/]*/mapTimezones[^/]*/" + 325 "|.*/metaZones[^/]*/mapTimezones[^/]*/" + 326 "|.*/segmentation[^/]*/variables[^/]*/" + 327 "|.*/segmentation[^/]*/suppressions[^/]*/" + 328 "|.*/transform[^/]*/tRules[^/]*/" + 329 "|.*/region/region[^/]*/" + 330 "|.*/keyword[^/]*/key[^/]*/" + 331 "|.*/telephoneCodeData[^/]*/codesByTerritory[^/]*/" + 332 "|.*/metazoneInfo[^/]*/timezone\\[[^\\]]*\\]/" + 333 "|.*/metadata[^/]*/validity[^/]*/" + 334 "|.*/metadata[^/]*/suppress[^/]*/" + 335 "|.*/metadata[^/]*/deprecated[^/]*/" + 336 ")(.*)"); 337 338 /** 339 * Number elements without a numbering system are there only for compatibility purposes. 340 * We automatically suppress generation of JSON objects for them. 341 */ 342 public static final Pattern NO_NUMBERING_SYSTEM_PATTERN = Pattern 343 .compile("//ldml/numbers/(symbols|(decimal|percent|scientific|currency)Formats)/.*"); 344 public static final Pattern NUMBERING_SYSTEM_PATTERN = Pattern 345 .compile("//ldml/numbers/(symbols|miscPatterns|(decimal|percent|scientific|currency)Formats)\\[@numberSystem=\"([^\"]++)\"\\]/.*"); 346 public static final String[] ACTIVE_NUMBERING_SYSTEM_XPATHS = { 347 "//ldml/numbers/defaultNumberingSystem", 348 "//ldml/numbers/otherNumberingSystems/native", 349 "//ldml/numbers/otherNumberingSystems/traditional", 350 "//ldml/numbers/otherNumberingSystems/finance" 351 }; 352 353 /** 354 * Root language id pattern should be discarded in all locales except root, 355 * even though the path will exist in a resolved CLDRFile. 356 */ 357 public static final Pattern ROOT_IDENTITY_PATTERN = Pattern 358 .compile("//ldml/identity/language\\[@type=\"root\"\\]"); 359 360 /** 361 * A simple class to hold the specification of a path transformation. 362 */ 363 public static class PathTransformSpec { 364 public Pattern pattern; 365 public String replacement; 366 PathTransformSpec(String patternStr, String replacement)367 PathTransformSpec(String patternStr, String replacement) { 368 pattern = PatternCache.get(patternStr); 369 this.replacement = replacement; 370 } 371 } 372 373 /** 374 * Some special transformation, like add an additional layer, can be easily 375 * done by transforming the path. Following rules covers these kind of 376 * transformation. 377 * Note: It is important to keep the order for these rules. Whenever a 378 * rule matches, further rules won't be applied. 379 */ 380 public static final PathTransformSpec PATH_TRANSFORMATIONS[] = { 381 // Add "standard" as type attribute to exemplarCharacter element if there 382 // is none, and separate them to two layers. 383 new PathTransformSpec( 384 "(.*ldml/exemplarCharacters)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"), 385 new PathTransformSpec("(.*ldml/exemplarCharacters)(.*)$", "$1/standard$2"), 386 387 // Add cldrVersion attribute 388 new PathTransformSpec("(.+)/identity/version\\[@number=\"([^\"]*)\"\\]", "$1" + "/identity/version\\[@cldrVersion=\"" 389 + CLDRFile.GEN_VERSION + "\"\\]"), 390 // Add cldrVersion attribute to supplemental data 391 new PathTransformSpec("(.+)/version\\[@number=\"([^\"]*)\"\\]\\[@unicodeVersion=\"([^\"]*\")(\\])", "$1" + "/version\\[@cldrVersion=\"" 392 + CLDRFile.GEN_VERSION + "\"\\]" + "\\[@unicodeVersion=\"" + "$3" + "\\]"), 393 394 // Transform underscore to hyphen-minus in language keys 395 new PathTransformSpec("(.*/language\\[@type=\"[a-z]{2,3})_([^\"]*\"\\](\\[@alt=\"short\"])?)", "$1-$2"), 396 397 // Separate "ellipsis" from its type as another layer. 398 new PathTransformSpec("(.*/ellipsis)\\[@type=\"([^\"]*)\"\\](.*)$", 399 "$1/$2$3"), 400 401 // Remove unnecessary dateFormat/pattern 402 new PathTransformSpec( 403 "(.*/calendars)/calendar\\[@type=\"([^\"]*)\"\\](.*)Length\\[@type=\"([^\"]*)\"\\]/(date|time|dateTime)Format\\[@type=\"([^\"]*)\"\\]/pattern\\[@type=\"([^\"]*)\"\\](.*)", 404 "$1/$2/$5Formats/$4$8"), 405 406 // Separate calendar type 407 new PathTransformSpec("(.*/calendars)/calendar\\[@type=\"([^\"]*)\"\\](.*)$", 408 "$1/$2$3"), 409 410 // Separate "metazone" from its type as another layer. 411 new PathTransformSpec("(.*/metazone)\\[@type=\"([^\"]*)\"\\]/(.*)$", "$1/$2/$3"), 412 413 // Split out types into its various fields 414 new PathTransformSpec("(.*)/types/type\\[@key=\"([^\"]*)\"\\]\\[@type=\"([^\"]*)\"\\](.*)$", 415 "$1/types/$2/$3$4"), 416 417 // Typographic 418 new PathTransformSpec("(.*)/(typographicNames)/(axisName|featureName)\\[@type=\"([^\"]*)\"\\](.*)$", 419 "$1/$2/$3s/$4$5"), 420 new PathTransformSpec("(.*)/(typographicNames)/(styleName)(.*)$", 421 "$1/$2/$3s/$3$4"), 422 423 // put CharacterLabelPatterns under CharacterLabelPatterns 424 new PathTransformSpec("(.*)/(characterLabels)/(characterLabelPattern)(.*)$", 425 "$1/characterLabelPatterns/$3$4"), 426 427 new PathTransformSpec( 428 "(.*/numbers/(decimal|scientific|percent|currency)Formats\\[@numberSystem=\"([^\"]*)\"\\])/(decimal|scientific|percent|currency)FormatLength/(decimal|scientific|percent|currency)Format\\[@type=\"standard\"]/pattern.*$", 429 "$1/standard"), 430 431 new PathTransformSpec( 432 "(.*/numbers/currencyFormats\\[@numberSystem=\"([^\"]*)\"\\])/currencyFormatLength/currencyFormat\\[@type=\"accounting\"]/pattern.*$", 433 "$1/accounting"), 434 // Add "type" attribute with value "standard" if there is no "type" in 435 // "decimalFormatLength". 436 new PathTransformSpec( 437 "(.*/numbers/(decimal|scientific|percent)Formats\\[@numberSystem=\"([^\"]*)\"\\]/(decimal|scientific|percent)FormatLength)/(.*)$", 438 "$1[@type=\"standard\"]/$5"), 439 440 new PathTransformSpec( 441 "(.*/listPattern)/(.*)$", "$1[@type=\"standard\"]/$2"), 442 443 new PathTransformSpec("(.*/languagePopulation)\\[@type=\"([^\"]*)\"\\](.*)", 444 "$1/$2$3"), 445 446 new PathTransformSpec("(.*/languageAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"), 447 new PathTransformSpec("(.*/scriptAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"), 448 new PathTransformSpec("(.*/territoryAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"), 449 new PathTransformSpec("(.*/subdivisionAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"), 450 new PathTransformSpec("(.*/variantAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"), 451 new PathTransformSpec("(.*/zoneAlias)\\[@type=\"([^\"]*)\"\\](.*)", "$1/$2$3"), 452 new PathTransformSpec("(.*/alias)(.*)", "$1/alias$2"), 453 454 new PathTransformSpec("(.*currencyData/region)(.*)", "$1/region$2"), 455 456 // Skip exemplar city in /etc/GMT or UTC timezones, since they don't have them. 457 new PathTransformSpec("(.*(GMT|UTC).*/exemplarCity)(.*)", ""), 458 459 new PathTransformSpec("(.*/transforms/transform[^/]*)/(.*)", "$1/tRules/$2"), 460 new PathTransformSpec("(.*)\\[@territories=\"([^\"]*)\"\\](.*)\\[@alt=\"variant\"\\](.*)", "$1\\[@territories=\"$2-alt-variant\"\\]"), 461 new PathTransformSpec("(.*)/weekData/(.*)\\[@alt=\"variant\"\\](.*)", "$1/weekData/$2$3"), 462 new PathTransformSpec("(.*)/unitPreferenceData/unitPreferences\\[@category=\"([^\"]*)\"\\]\\[@usage=\"([^\"]*)\"\\](.*)", 463 "$1/unitPreferenceData/unitPreferences/$2/$3$4"), 464 465 // Annotations 466 // If there is a type, move that into a sibling value 467 new PathTransformSpec("(.*)/(annotations)/(annotation)\\[@cp=\"([^\"]*)\"\\]\\[@type=\"([^\"]*)\"\\](.*)$", 468 "$1/$2/$4/$5$6"), 469 new PathTransformSpec("(.*)/(annotations)/(annotation)\\[@cp=\"([^\"]*)\"\\](.*)$", 470 "$1/$2/$4/default$5"), 471 }; 472 } 473