// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.mapper;

import static com.google.common.base.Ascii.toLowerCase;
import static com.google.common.base.Preconditions.checkState;
import static org.unicode.cldr.api.AttributeKey.keyOf;
import static org.unicode.cldr.api.CldrData.PathOrder.DTD;
import static org.unicode.cldr.api.CldrDataType.BCP47;

import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;

import org.unicode.cldr.api.AttributeKey;
import org.unicode.cldr.api.CldrData;
import org.unicode.cldr.api.CldrDataSupplier;
import org.unicode.cldr.api.CldrDataType;
import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import org.unicode.icu.tool.cldrtoicu.IcuData;
import org.unicode.icu.tool.cldrtoicu.RbPath;
import org.unicode.icu.tool.cldrtoicu.CldrDataProcessor;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Ascii;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Sets;

/**
 * Collects BCP-47 key and type information from {@link CldrDataType#BCP47 BCP47} data and
 * converts it into ICU data. The CLDR paths of interest have the form:
 * <pre>{@code
 *   //ldmlBCP47/keyword/key[@name=*]/type[@name=*]
 * }</pre>
 *
 * <p>Types belonging to the "tz" key are written to the "timezoneTypes" data, and everything
 * else is written to the "keyTypeData" data.
 */
public final class Bcp47Mapper {
    // Only the distinguishing "name" attributes appear in the matched paths. The remaining
    // attributes (e.g. "alias") are value attributes and are read from the CldrValue directly.
    private static final AttributeKey KEY_NAME = keyOf("key", "name");
    private static final AttributeKey KEY_ALIAS = keyOf("key", "alias");
    private static final AttributeKey KEY_VALUE_TYPE = keyOf("key", "valueType");

    private static final AttributeKey TYPE_NAME = keyOf("type", "name");
    private static final AttributeKey TYPE_ALIASES = keyOf("type", "alias");
    private static final AttributeKey PREFERRED_TYPE_NAME = keyOf("type", "preferred");

    // Deprecation of the data is not the same thing as deprecation of the attributes
    // themselves. These flags mark data which still exists but is no longer the right way to
    // represent things (so it can be important for clients to know about it).
    private static final AttributeKey KEY_DEPRECATED = keyOf("key", "deprecated");
    private static final AttributeKey TYPE_DEPRECATED = keyOf("type", "deprecated");

    // The attributes which may be emitted under the /keyInfo or /typeInfo paths as auxiliary
    // information, each mapped to its default value. An attribute whose value equals the
    // default is not emitted.
    // NOTE: Hard-coding the defaults is a hack which exists because there is no nice way (yet)
    // to obtain the default of an implicit value from the DTD. Ideally this would be automatic
    // and AttributeKey would offer something like "isDefault(String value)".
    private static final ImmutableMap<AttributeKey, String> INFO_ATTRIBUTES =
        ImmutableMap.of(KEY_VALUE_TYPE, "", KEY_DEPRECATED, "false", TYPE_DEPRECATED, "false");

    private static final RbPath RB_KEYMAP = RbPath.of("keyMap");
    private static final RbPath RB_TYPE_ALIAS = RbPath.of("typeAlias", "timezone:alias");
    private static final RbPath RB_MAP_ALIAS = RbPath.of("typeMap", "timezone:alias");
    private static final RbPath RB_BCP_ALIAS = RbPath.of("bcpTypeAlias", "tz:alias");

    private static final CldrDataProcessor<Bcp47Mapper> BCP47_PROCESSOR = newProcessor();

    // Builds the processor: each "key" path creates a ValueCollector bound to the mapper, and
    // every "type" value below that key is passed to the collector.
    private static CldrDataProcessor<Bcp47Mapper> newProcessor() {
        CldrDataProcessor.Builder<Bcp47Mapper> builder = CldrDataProcessor.builder();
        builder
            .addAction(
                "//ldmlBCP47/keyword/key[@name=*]",
                (mapper, keyPath) -> mapper.new ValueCollector(keyPath))
            .addValueAction("type[@name=*]", ValueCollector::collect);
        return builder.build();
    }

    /**
     * Processes data from the given supplier to generate Timezone and BCP-47 ICU data.
     *
     * @param src the CLDR data supplier to process.
     * @return A list of IcuData instances containing BCP-47 data to be written to files.
     */
    public static ImmutableList<IcuData> process(CldrDataSupplier src) {
        return process(src.getDataForType(BCP47));
    }

    /**
     * Processes the given BCP-47 data directly. This overload exists because it is easier for
     * tests to supply a fake data instance than a fake supplier.
     *
     * @param cldrData the BCP-47 CLDR data to process (visited in DTD order).
     * @return A list holding the "keyTypeData" instance followed by the "timezoneTypes" one.
     */
    @VisibleForTesting
    static ImmutableList<IcuData> process(CldrData cldrData) {
        Bcp47Mapper populated = BCP47_PROCESSOR.process(cldrData, new Bcp47Mapper(), DTD);
        populated.addKeyMapValues();
        return ImmutableList.of(populated.keyTypeData, populated.tzData);
    }

    // The two outputs of the mapper. Which one receives a given "type" value depends on the
    // name of the enclosing key (see ValueCollector).
    private final IcuData tzData = new IcuData("timezoneTypes", false);
    private final IcuData keyTypeData = new IcuData("keyTypeData", false);
    // Collects, in visitation order, both the key-name to key-alias entries and the '@'
    // prefixed "info" entries. Everything here is written out by addKeyMapValues().
    // TODO: Convert this to a Map<RbPath, String> which involves removing the '@' prefix hack.
    private final Map<String, String> keyMap = new LinkedHashMap<>();

    private Bcp47Mapper() { }

    // Post-processing step: writes everything captured in keyMap into the "keyTypeData" data
    // and then appends the fixed aliases which point at the timezone data.
    private void addKeyMapValues() {
        for (Entry<String, String> entry : keyMap.entrySet()) {
            String mapKey = entry.getKey();
            String mapValue = entry.getValue();
            if (mapKey.startsWith("@")) {
                // Reverses the '@' hack of addInfoAttributes(); this could be done better.
                // The remainder is a full path rather than a single element, hence "parse()".
                keyTypeData.add(RbPath.parse(mapKey.substring(1)), mapValue);
            } else {
                // An empty value signals that the BCP47 key is the same as the legacy key.
                String bcpKey = mapKey.equals(mapValue) ? "" : mapKey;
                keyTypeData.add(RB_KEYMAP.extendBy(mapValue), bcpKey);
            }
        }
        // Aliases for the timezone data.
        keyTypeData.add(RB_TYPE_ALIAS, "/ICUDATA/timezoneTypes/typeAlias/timezone");
        keyTypeData.add(RB_MAP_ALIAS, "/ICUDATA/timezoneTypes/typeMap/timezone");
        keyTypeData.add(RB_BCP_ALIAS, "/ICUDATA/timezoneTypes/bcpTypeAlias/tz");
    }

    // Handles all the "type" values found below a single "key" element. One instance is
    // created per visited key path and it writes into the state of the enclosing mapper.
    private final class ValueCollector {
        // The lower-cased name of the key being processed.
        private final String keyName;
        // The (mutable) data this collector writes into, selected by the key name.
        private final IcuData icuData;

        ValueCollector(CldrPath prefix) {
            this.keyName = Ascii.toLowerCase(KEY_NAME.valueFrom(prefix));
            this.icuData = "tz".equals(keyName) ? tzData : keyTypeData;
        }

        // Processes one "type" value. A value with a "preferred" replacement is recorded only
        // as an alias; any other value produces type map entries and "info" attributes.
        private void collect(CldrValue value) {
            String typeName = TYPE_NAME.valueFrom(value);
            Optional<String> preferred = PREFERRED_TYPE_NAME.optionalValueFrom(value);
            if (preferred.isPresent()) {
                addPreferredAlias(value, typeName, preferred.get());
            } else {
                // Note: Some deprecated values have no preferred replacement. They end up
                // here, which matters because the fact that they are deprecated must still
                // be emitted.
                addTypeMappings(value, typeName);
                addInfoAttributes(keyName, typeName, value.getValueAttributes());
            }
        }

        // Records a type which has a preferred replacement as a "bcpTypeAlias" entry. Such
        // values are expected to always be explicitly deprecated, which is verified here.
        private void addPreferredAlias(CldrValue value, String typeName, String preferredName) {
            boolean deprecated = KEY_DEPRECATED.booleanValueFrom(value, false)
                || TYPE_DEPRECATED.booleanValueFrom(value, false);
            checkState(deprecated,
                "unexpected 'preferred' attribute for non-deprecated value: %s", value);
            icuData.add(RbPath.of("bcpTypeAlias", keyName, typeName), preferredName);
        }

        // Records the key alias and emits the "typeMap" (and, if needed, "typeAlias") entries
        // for a type without a preferred replacement.
        private void addTypeMappings(CldrValue value, String typeName) {
            // Not every key element has an alias. E.g. in calendar.xml:
            //     <key name="fw" description="First day of week" since="28">
            // Such a key is still recorded as an alias of itself (which is later turned into
            // a path with an empty value).
            String keyAlias = toLowerCase(KEY_ALIAS.valueFrom(value, keyName));

            keyMap.put(keyName, keyAlias);
            RbPath typeMapPrefix = RbPath.of("typeMap", keyAlias);

            List<String> typeAliases = TYPE_ALIASES.listOfValuesFrom(value);
            if (typeAliases.isEmpty()) {
                // An empty value means the same type name is used for both the BCP47 and
                // the legacy type.
                icuData.add(typeMapPrefix.extendBy(typeName), "");
                return;
            }
            String mainAlias = typeAliases.get(0);
            icuData.add(typeMapPrefix.extendBy(quoteAlias(mainAlias)), typeName);
            // Every remaining alias is a secondary alias which refers to the main alias.
            RbPath typeAliasPrefix = RbPath.of("typeAlias", keyAlias);
            for (int i = 1; i < typeAliases.size(); i++) {
                String secondaryAlias = quoteAlias(typeAliases.get(i));
                icuData.add(typeAliasPrefix.extendBy(secondaryAlias), mainAlias);
            }
        }

        // Adds any additional "info" attributes which are present to the key map. This code
        // was copied from largely undocumented code, and the precise reasoning for why it is
        // needed, or why it is done this way, is not completely clear. It can very likely be
        // simplified.
        //
        // The '@' prefix is just a magic token which addKeyMapValues() strips off again. It
        // appears to exist only to tell entries added here apart from those added while
        // collecting values. Having two maps might be a better approach.
        // TODO: Remove the use of '@' and simplify the logic for "info" attributes (infoMap?).
        private void addInfoAttributes(
            String keyName, String typeName, ImmutableMap<AttributeKey, String> attributes) {
            // Only the attributes listed in INFO_ATTRIBUTES are candidates for being emitted.
            Set<AttributeKey> candidates =
                Sets.intersection(attributes.keySet(), INFO_ATTRIBUTES.keySet());
            for (AttributeKey attribute : candidates) {
                String value = attributes.get(attribute);
                String defaultValue = INFO_ATTRIBUTES.get(attribute);
                // Empty and default values are not emitted.
                if (!value.isEmpty() && !defaultValue.equals(value)) {
                    // The ID of an xxxInfo path in ICU is the path fragment at which the
                    // attribute exists. Only complete paths are processed here, so the ID is
                    // rebuilt from the element name of the attribute. This relies on the
                    // explicit knowledge that the paths are either "<key>" or "<key>/<type>".
                    // This all gets less messy after a switch to RbPath.
                    String elementName = attribute.getElementName();
                    String id = elementName.equals("key") ? keyName : keyName + "/" + typeName;
                    String infoPath =
                        "@" + elementName + "Info/" + attribute.getAttributeName() + "/" + id;
                    keyMap.put(infoPath, value);
                }
            }
        }
    }

    /**
     * Escapes alias values containing '/' so they can appear in resource bundle paths. Each
     * '/' is replaced by ':' and the result is wrapped in double quotes (e.g. foo/bar becomes
     * "foo:bar"). Values without a '/' are returned unchanged.
     *
     * <p>This is needed for timezone "metazone" ID strings which are of the form 'Foo/Bar'
     * in the CLDR data.
     */
    // TODO: Switch to RbPath and do quoting automatically when ICU data is written out.
    private static String quoteAlias(String str) {
        if (str.indexOf('/') < 0) {
            return str;
        }
        return "\"" + str.replace('/', ':') + "\"";
    }
}