1 package org.unicode.cldr.api; 2 3 import static com.google.common.base.Preconditions.checkArgument; 4 import static com.google.common.base.Preconditions.checkState; 5 import static java.lang.Integer.signum; 6 import static org.unicode.cldr.api.CldrDataType.LDML; 7 8 import java.util.ArrayList; 9 import java.util.Comparator; 10 import java.util.List; 11 import java.util.Map; 12 import java.util.Map.Entry; 13 import java.util.function.BiConsumer; 14 import java.util.function.Consumer; 15 import java.util.function.Predicate; 16 import java.util.stream.Stream; 17 18 import org.unicode.cldr.util.DtdData; 19 import org.unicode.cldr.util.XPathParts; 20 21 import com.google.common.collect.ImmutableMap; 22 import com.google.common.collect.ImmutableSetMultimap; 23 24 /** 25 * Utilities related to CLDR paths. It's possible that one day we might wish to expose the path 26 * comparator from this class, but if so we should key it off something other than {@code DtdType}. 27 */ 28 final class CldrPaths { 29 // Constants to make comparator logic a bit more readable (i.e. LHS < RHS ==> return -1). 30 private static final int LHS_FIRST = -1; 31 private static final int RHS_FIRST = 1; 32 33 // A synthetic attribute used by CLDR code to apply an explicit ordering to otherwise identical 34 // paths (this only happens for "ORDERED" elements). We handle this differently and have the 35 // sort index as an explicit field in CldrPath rather than treating it as an attribute. 36 private static final String HIDDEN_SORT_INDEX_ATTRIBUTE = "_q"; 37 38 // When calculating things about elements we need to ignore deprecated ones, especially when 39 // looking for "leaf" elements, since CLDR data used to allow mixed content and the DTD still 40 // has elements with both data and (deprecated) child elements present. 41 private static final Predicate<DtdData.Element> IS_NOT_DEPRECATED = e -> !e.isDeprecated(); 42 43 // A map of the leaf element names for each supported DTD. 44 private static final ImmutableSetMultimap<CldrDataType, String> LEAF_ELEMENTS_MAP; 45 // A map of the ordered element names for each supported DTD. 46 private static final ImmutableSetMultimap<CldrDataType, String> ORDERED_ELEMENTS_MAP; 47 static { 48 ImmutableSetMultimap.Builder<CldrDataType, String> leafElementsMap = 49 ImmutableSetMultimap.builder(); 50 ImmutableSetMultimap.Builder<CldrDataType, String> orderedElementsMap = 51 ImmutableSetMultimap.builder(); 52 for (CldrDataType type : CldrDataType.values()) { 53 // While at happened to be true (at the time of writing) that the getElements() method 54 // returns a new, mutable set, this is completely undocumented so we cannot rely on it 55 // (otherwise we could just do "removeIf(...)"). 56 // 57 // Note that while the "specials" element has no children in the DTD, it is not 58 // considered a "leaf" element as it is expected to contain any additional elements 59 // from unknown namespaces (e.g. "icu:"). 60 // 61 // We know that Guava's collection classes have a policy of never iterating over a 62 // collection more than once, so it's safe to use the ::iterator trick to convert a 63 // stream into a one-shot iterable (saves have to make temporary collections). leafElementsMap.putAll( type, type.getElements() .filter(IS_NOT_DEPRECATED) .filter(e -> e.getChildren().keySet().stream().noneMatch(IS_NOT_DEPRECATED)) .map(DtdData.Element::getName) .filter(e -> !e.equals("special")) ::iterator)64 leafElementsMap.putAll( 65 type, 66 type.getElements() 67 .filter(IS_NOT_DEPRECATED) 68 // NOTE: Some leaf elements still have deprecated children (from when mixed 69 // data was permitted). 70 .filter(e -> e.getChildren().keySet().stream().noneMatch(IS_NOT_DEPRECATED)) 71 .map(DtdData.Element::getName) 72 .filter(e -> !e.equals("special")) 73 ::iterator); orderedElementsMap.putAll( type, type.getElements() .filter(IS_NOT_DEPRECATED) .filter(DtdData.Element::isOrdered) .map(DtdData.Element::getName) ::iterator)74 orderedElementsMap.putAll( 75 type, 76 type.getElements() 77 .filter(IS_NOT_DEPRECATED) 78 .filter(DtdData.Element::isOrdered) 79 .map(DtdData.Element::getName) 80 ::iterator); 81 } 82 // Special case "alias" is an alternate leaf element for a lot of LDML elements. leafElementsMap.put(LDML, "alias")83 leafElementsMap.put(LDML, "alias"); 84 LEAF_ELEMENTS_MAP = leafElementsMap.build(); 85 ORDERED_ELEMENTS_MAP = orderedElementsMap.build(); 86 } 87 88 // A map of path comparators for each supported DTD. 89 private static final ImmutableMap<CldrDataType, DtdPathComparator> PATH_COMPARATOR_MAP; 90 91 // This static block must come after the attribute map is created since it uses it. 92 static { 93 ImmutableMap.Builder<CldrDataType, DtdPathComparator> pathMap = ImmutableMap.builder(); 94 for (CldrDataType type : CldrDataType.values()) { pathMap.put(type, new DtdPathComparator(type))95 pathMap.put(type, new DtdPathComparator(type)); 96 } 97 PATH_COMPARATOR_MAP = pathMap.build(); 98 } 99 100 /** 101 * Returns a comparator for {@link CldrPath}s which is compatible with the canonical path 102 * ordering for the given DTD instance (e.g. {@link DtdData#getDtdComparator}()). 103 */ 104 // TODO: Add tests to ensure it continues to agree to DTD ordering. getPathComparator(CldrDataType type)105 static Comparator<CldrPath> getPathComparator(CldrDataType type) { 106 return PATH_COMPARATOR_MAP.get(type); 107 } 108 109 private static final class DtdPathComparator implements Comparator<CldrPath> { 110 private final Comparator<String> elementNameComparator; 111 private final Comparator<String> attributeNameComparator; 112 DtdPathComparator(CldrDataType dataType)113 private DtdPathComparator(CldrDataType dataType) { 114 this.elementNameComparator = dataType.getElementComparator(); 115 this.attributeNameComparator = dataType.getAttributeComparator(); 116 } 117 118 // This code should only return "signum" values for ordering (i.e. {-1, 0, 1}). 119 @Override compare(CldrPath lhs, CldrPath rhs)120 public int compare(CldrPath lhs, CldrPath rhs) { 121 int length = lhs.getLength(); 122 if (length == rhs.getLength()) { 123 if (length == 1) { 124 // Root nodes are special as they define the DTD type and must always be equal. 125 // Paths with different types can be compared, but not via this comparator. 126 checkState(lhs.getName().equals(rhs.getName()), 127 "cannot compare paths with different DTD type: %s / %s", lhs, rhs); 128 return 0; 129 } 130 // Compare parent paths first and return if they give an ordering. 131 int signum = compare(lhs.getParent(), rhs.getParent()); 132 return (signum != 0) ? signum : compareCurrentElement(lhs, rhs); 133 } else if (length < rhs.getLength()) { 134 // Recursively shorten the RHS path until it's the same length. 135 int signum = compare(lhs, rhs.getParent()); 136 // If the LHS is equal to the RHS parent, then the (shorter) LHS path comes first. 137 // Note this this can only happen if we are comparing non-leaf node paths. 138 return signum != 0 ? signum : LHS_FIRST; 139 } else { 140 // Flip the comparison if LHS was longer (we do this at most once per comparison). 141 return -compare(rhs, lhs); 142 } 143 } 144 compareCurrentElement(CldrPath lhs, CldrPath rhs)145 private int compareCurrentElement(CldrPath lhs, CldrPath rhs) { 146 String elementName = lhs.getName(); 147 int signum = signum(elementNameComparator.compare(elementName, rhs.getName())); 148 if (signum != 0) { 149 return signum; 150 } 151 // Primary order within a path element is defined by the element index (this is a 152 // bit of a special value used only for "ORDERED" elements in the DTD. This always 153 // trumps any attribute ordering but is almost always -1. 154 signum = Integer.compare(lhs.getSortIndex(), rhs.getSortIndex()); 155 if (signum != 0) { 156 return signum; 157 } 158 // Element name is the same, so test attributes. Attributes are already known to be 159 // ordered by the element's DTD order, so we only need to find and compare the first 160 // difference. 161 int minAttributeCount = 162 Math.min(lhs.getAttributeCount(), rhs.getAttributeCount()); 163 for (int n = 0; n < minAttributeCount && signum == 0; n++) { 164 String attributeName = lhs.getLocalAttributeName(n); 165 // Important: We negate the comparison result here because we want elements with 166 // "missing" attributes to sort earlier. 167 // 168 // E.g. for two elements LHS="foo[a=x][b=y]" and RHS="foo[b=y]" we want to say 169 // that "RHS < LHS" because RHS is "missing" the attribute "[a=x]". But when we 170 // compare the first attributes we find "a" (LHS) < "b" (RHS), which is the 171 // opposite of what we want. This is because while LHS has a lower ordered 172 // attribute, that indicates that RHS is missing that attribute in the same 173 // position, which should make RHS sort first. 174 signum = -signum( 175 attributeNameComparator.compare(attributeName, rhs.getLocalAttributeName(n))); 176 if (signum == 0) { 177 // Attribute names equal, so test attribute value. 178 signum = signum( 179 DtdData.getAttributeValueComparator(elementName, attributeName) 180 .compare(lhs.getLocalAttributeValue(n), rhs.getLocalAttributeValue(n))); 181 } 182 } 183 if (signum == 0) { 184 // Attributes match up to the minimum length, but one element might have more. 185 // Elements with more attributes sort _after_ those without. 186 if (lhs.getAttributeCount() > minAttributeCount) { 187 signum = RHS_FIRST; 188 } else if (rhs.getAttributeCount() > minAttributeCount) { 189 signum = LHS_FIRST; 190 } 191 } 192 return signum; 193 } 194 } 195 196 /** Returns whether this path is a full path ending in a "leaf" element. */ isLeafPath(CldrPath path)197 static boolean isLeafPath(CldrPath path) { 198 String lastElementName = path.getName(); 199 return lastElementName.indexOf(':') != -1 200 || LEAF_ELEMENTS_MAP.get(path.getDataType()).contains(lastElementName); 201 } 202 203 /** 204 * Returns whether an element is "ORDERED" in the specified DTD (and should have an explicit 205 * sort index). 206 */ isOrdered(CldrDataType dtdType, String elementName)207 static boolean isOrdered(CldrDataType dtdType, String elementName) { 208 // Elements with namespaces unknown to the DTD are never ordered 209 // (but it knows about icu: elements). 210 if (elementName.indexOf(':') != -1 && !elementName.startsWith("icu:")) { 211 return false; 212 } 213 // Returns empty set if DTD unknown, but it might also be the case that a valid DTD has no 214 // ordered elements, so we can't reasonable check for anything here. 215 return ORDERED_ELEMENTS_MAP.get(dtdType).contains(elementName); 216 } 217 218 // This can't go further up due to static initialization ordering issues. 219 // TODO: Move all reading of DTDs and setup for paths into a lazy holder. 220 private static final CldrPath LDML_VERSION = 221 CldrPath.parseDistinguishingPath("//ldml/identity/version"); 222 223 /** 224 * Returns whether this path should be emitted by a data supplier. New cases can be added 225 * as needed. 226 */ shouldEmit(CldrPath path)227 static boolean shouldEmit(CldrPath path) { 228 // CLDRFile seems to do some interesting things with the version based on the DTD in 229 // which we see: 230 // 231 // <!ATTLIST version number CDATA #REQUIRED > 232 // <!--@MATCH:regex/\$Revision.*\$--> 233 // <!--@METADATA--> 234 // 235 // <!ATTLIST version cldrVersion CDATA #FIXED "36" > 236 // <!--@MATCH:any--> 237 // <!--@VALUE--> 238 // 239 // This results in conflict between values obtained via CLDRFile and those obtained 240 // directly by parsing an LDML XML file (which happens for "special" XML files in ICU). 241 // 242 // So for now I've decided to just ignore the version (since it's hardly used in the 243 // ICU converter code and always available via CldrDataSupplier#getCldrVersionString(). 244 245 // Hacky way to detect the version declaration in LDML files (which must always be 246 // skipped since they are duplicate paths and reveal XML file boundaries). The path is 247 // always "//ldmlXxxx/version" for some DTD type. 248 if (path.getLength() == 2 && path.getName().equals("version")) { 249 return false; 250 } 251 252 // Note that there is a need for some kind of versioning for some bits of data (since 253 // changes to things like collation can invalidate database search indexes) but this should 254 // be handled directly in the logic which builds the ICU data and isn't strictly the same 255 // as the LDML version anyway. 256 // 257 // TODO: Remove this code if and when LDML version strings are removed. 258 return !path.equals(LDML_VERSION); 259 } 260 261 /** 262 * Processes a full path to extract a distinguishing CldrPath and handle any value attributes 263 * present. This is designed for iterating over successive paths from CLDRFile, but can be used 264 * to create paths in isolation if necessary. 265 * 266 * @param fullPath a parsed full path. 267 * @param previousElements an optional list of previous CldrPath elements which will be used 268 * as the prefix to the new path wherever possible (e.g. if previousElements="(a,b,c,d)" 269 * and "fullPath=a/b/x/y/z", then the new path will share the path prefix up to and 270 * including 'b'). When processing sorted paths, this will greatly reduce allocations. 271 * @param valueAttributeCollector a collector into which value attributes will be added (in 272 * DTD order). 273 * @return a new CldrPath corresponding to the distinguishing path of {@code fullPath}. 274 * @throws IllegalArgumentException if the path string is invalid. 275 */ processXPath( String fullPath, List<CldrPath> previousElements, BiConsumer<AttributeKey, String> valueAttributeCollector)276 static CldrPath processXPath( 277 String fullPath, 278 List<CldrPath> previousElements, 279 BiConsumer<AttributeKey, String> valueAttributeCollector) { 280 checkArgument(!fullPath.isEmpty(), "path must not be empty"); 281 // This fails if attribute names are invalid, but not if element names are invalid, and we 282 // want to control/stabalize the error messages in this API. 283 XPathParts pathParts; 284 try { 285 pathParts = XPathParts.getFrozenInstance(fullPath); 286 } catch (IllegalArgumentException e) { 287 throw new IllegalArgumentException("invalid path: " + fullPath); 288 } 289 int length = pathParts.size(); 290 checkArgument(length > 0, "cldr path must not be empty: %s", pathParts); 291 DtdData dtd = pathParts.getDtdData(); 292 checkArgument(dtd != null, "unknown DTD type: %s", pathParts); 293 CldrDataType dtdType = CldrDataType.forRawType(dtd.dtdType); 294 295 // The path we're returning, constructed from top-to-bottom. 296 CldrPath path = null; 297 // Reusable key/value attributes list. 298 List<String> keyValuePairs = new ArrayList<>(); 299 Consumer<Entry<String, String>> collectElementAttribute = e -> { 300 keyValuePairs.add(e.getKey()); 301 keyValuePairs.add(e.getValue()); 302 }; 303 // True once we've started to create a new path rather than reusing previous elements. 304 boolean diverged = false; 305 for (int n = 0; n < length; n++) { 306 String elementName = pathParts.getElement(n); 307 308 // If this path is from CldrPath.toString() then the sort index is encoded in the 309 // element name suffix (e.g. foo#42[@bar="x"]), otherwise it's in the synthetic "_q" 310 // attribute. Most often there is no sort index however, so this code should optimize 311 // to the null case. 312 int sortIndex = -1; 313 Map<String, String> attributes = pathParts.getAttributes(n); 314 int nameEnd = elementName.indexOf('#'); 315 if (nameEnd != -1) { 316 sortIndex = Integer.parseUnsignedInt(elementName.substring(nameEnd + 1)); 317 elementName = elementName.substring(0, nameEnd); 318 } else { 319 String sortIndexStr = attributes.get(HIDDEN_SORT_INDEX_ATTRIBUTE); 320 if (sortIndexStr != null) { 321 sortIndex = Integer.parseUnsignedInt(sortIndexStr); 322 } 323 } 324 325 // Note that element names need not be known to the DTD. If an element's name is 326 // prefixed with an unknown namespace (e.g. "icu:") then it is always permitted. 327 // Similarly, we never filter out attributes in unknown namespaces. For now we assume 328 // that any explicit namespace is unknown. 329 boolean hasNamespace = elementName.indexOf(':') != -1; 330 checkArgument(hasNamespace || dtd.getElementFromName().containsKey(elementName), 331 "invalid path: %s", fullPath); 332 333 // The keyValuePairs list is used by the collectElementAttribute callback but we don't 334 // want/need to make a new callback each time, so just clear the underlying list. 335 keyValuePairs.clear(); 336 processPathAttributes( 337 elementName, attributes, dtdType, collectElementAttribute, valueAttributeCollector); 338 339 // WARNING: We cannot just get the draft attribute value from the attributes map (as 340 // would be expected) because it throws an exception. This is because you can only 341 // "get()" values from the attributes map if they are potentially valid attributes for 342 // that element. Unfortunately this is due to a deliberate choice in the implementation 343 // of MapComparator, which explicitly throws an exception if an unknown attribute is 344 // encountered. Thus we have to check that "draft" is a possible attribute before 345 // asking for its value. 346 // 347 // TODO(dbeaumont): Fix this properly, ideally by fixing the comparator to not throw. 348 CldrDraftStatus draftStatus = dtd.getAttribute(elementName, "draft") != null 349 ? CldrDraftStatus.forString(attributes.get("draft")) : null; 350 351 if (!diverged && n < previousElements.size()) { 352 CldrPath p = previousElements.get(n); 353 if (p.matchesContent(elementName, sortIndex, keyValuePairs, draftStatus)) { 354 // The previous path we processed was the same at least down to this depth, so 355 // we can just reuse that element instead of creating a new one. 356 // 357 // In tests over the resolved paths for "en_GB" in DTD order, there are ~128k 358 // path elements processed, with ~103k being reused here and ~25k being newly 359 // allocated (a > 80% reuse rate). However this works best when the incoming 360 // paths are sorted, since otherwise the "previous" path is random. 361 // 362 // For unsorted paths, the reuse rate is reduced to approximately 25%. This is 363 // still saving ~32k allocations for the "en_GB" example, and it still saved 364 // about 35% in terms of measured time to process the paths. 365 path = p; 366 continue; 367 } 368 } 369 // This path has diverged from the previous path, so we must start making new elements. 370 path = new CldrPath(path, elementName, keyValuePairs, dtdType, draftStatus, sortIndex); 371 diverged = true; 372 } 373 return path; 374 } 375 376 // Returns the element's sort index (or -1 if not present). processPathAttributes( String elementName, Map<String, String> attributeMap, CldrDataType dtdType, Consumer<Entry<String, String>> collectElementAttribute, BiConsumer<AttributeKey, String> collectValueAttribute)377 static void processPathAttributes( 378 String elementName, 379 /* @Nullable */ Map<String, String> attributeMap, 380 CldrDataType dtdType, 381 Consumer<Entry<String, String>> collectElementAttribute, 382 BiConsumer<AttributeKey, String> collectValueAttribute) { 383 384 // Why not just a Map? Maps don't efficiently handle "get the Nth element" which is 385 // something that's used a lot in the ICU conversion code. Distinguishing attributes in 386 // paths are used more as a "list of pairs" than a map with key/value lookup. Even 387 // extensions like "NavigableMap" don't give you what you want. 388 // 389 // This does mean that lookup-by-name is a linear search (unless you want to pay the cost 390 // of also having a map here) but the average number of attributes is very small (<3). 391 if (attributeMap != null && !attributeMap.isEmpty()) { 392 processAttributes( 393 attributeMap.entrySet().stream(), elementName, collectValueAttribute, dtdType) 394 .forEach(collectElementAttribute); 395 } 396 } 397 processAttributes( Stream<Entry<String, String>> in, String elementName, BiConsumer<AttributeKey, String> collectValueAttribute, CldrDataType dtdType)398 static Stream<Entry<String, String>> processAttributes( 399 Stream<Entry<String, String>> in, 400 String elementName, 401 BiConsumer<AttributeKey, String> collectValueAttribute, 402 CldrDataType dtdType) { 403 Consumer<Entry<String, String>> collectValueAttributes = e -> { 404 if (dtdType.isValueAttribute(elementName, e.getKey())) { 405 collectValueAttribute.accept(AttributeKey.keyOf(elementName, e.getKey()), e.getValue()); 406 } 407 }; 408 return in 409 // Special case of a synthetic distinguishing attribute that's _not_ in the DTD, and 410 // should be ignored. 411 .filter(e -> !e.getKey().equals(HIDDEN_SORT_INDEX_ATTRIBUTE)) 412 .peek(collectValueAttributes) 413 .filter(e -> dtdType.isDistinguishingAttribute(elementName, e.getKey())); 414 } 415 CldrPaths()416 private CldrPaths() {} 417 } 418