1 package org.unicode.cldr.api; 2 3 import static com.google.common.base.Preconditions.checkArgument; 4 import static com.google.common.base.Preconditions.checkState; 5 import static java.lang.Integer.signum; 6 import static org.unicode.cldr.api.CldrDataType.LDML; 7 8 import java.util.ArrayList; 9 import java.util.Comparator; 10 import java.util.List; 11 import java.util.Map; 12 import java.util.Map.Entry; 13 import java.util.function.BiConsumer; 14 import java.util.function.Consumer; 15 import java.util.function.Predicate; 16 import java.util.stream.Stream; 17 18 import org.unicode.cldr.util.DtdData; 19 import org.unicode.cldr.util.XPathParts; 20 21 import com.google.common.collect.ImmutableMap; 22 import com.google.common.collect.ImmutableSetMultimap; 23 24 /** 25 * Utilities related to CLDR paths. It's possible that one day we might wish to expose the path 26 * comparator from this class, but if so we should key it off something other than {@code DtdType}. 27 */ 28 final class CldrPaths { 29 // Constants to make comparator logic a bit more readable (i.e. LHS < RHS ==> return -1). 30 private static final int LHS_FIRST = -1; 31 private static final int RHS_FIRST = 1; 32 33 // A synthetic attribute used by CLDR code to apply an explicit ordering to otherwise identical 34 // paths (this only happens for "ORDERED" elements). We handle this differently and have the 35 // sort index as an explicit field in CldrPath rather than treating it as an attribute. 36 private static final String HIDDEN_SORT_INDEX_ATTRIBUTE = "_q"; 37 38 // When calculating things about elements we need to ignore deprecated ones, especially when 39 // looking for "leaf" elements, since CLDR data used to allow mixed content and the DTD still 40 // has elements with both data and (deprecated) child elements present. 41 private static final Predicate<DtdData.Element> IS_NOT_DEPRECATED = e -> !e.isDeprecated(); 42 43 // A map of the leaf element names for each supported DTD. 44 private static final ImmutableSetMultimap<CldrDataType, String> LEAF_ELEMENTS_MAP; 45 // A map of the ordered element names for each supported DTD. 46 private static final ImmutableSetMultimap<CldrDataType, String> ORDERED_ELEMENTS_MAP; 47 static { 48 ImmutableSetMultimap.Builder<CldrDataType, String> leafElementsMap = 49 ImmutableSetMultimap.builder(); 50 ImmutableSetMultimap.Builder<CldrDataType, String> orderedElementsMap = 51 ImmutableSetMultimap.builder(); 52 for (CldrDataType type : CldrDataType.values()) { 53 // While at happened to be true (at the time of writing) that the getElements() method 54 // returns a new, mutable set, this is completely undocumented so we cannot rely on it 55 // (otherwise we could just do "removeIf(...)"). 56 // 57 // Note that while the "specials" element has no children in the DTD, it is not 58 // considered a "leaf" element as it is expected to contain any additional elements 59 // from unknown namespaces (e.g. "icu:"). 60 // 61 // We know that Guava's collection classes have a policy of never iterating over a 62 // collection more than once, so it's safe to use the ::iterator trick to convert a 63 // stream into a one-shot iterable (saves have to make temporary collections). leafElementsMap.putAll( type, type.getElements() .filter(IS_NOT_DEPRECATED) .filter(e -> e.getChildren().keySet().stream().noneMatch(IS_NOT_DEPRECATED)) .map(DtdData.Element::getName) .filter(e -> !e.equals("special")) ::iterator)64 leafElementsMap.putAll( 65 type, 66 type.getElements() 67 .filter(IS_NOT_DEPRECATED) 68 // NOTE: Some leaf elements still have deprecated children (from when mixed 69 // data was permitted). 70 .filter(e -> e.getChildren().keySet().stream().noneMatch(IS_NOT_DEPRECATED)) 71 .map(DtdData.Element::getName) 72 .filter(e -> !e.equals("special")) 73 ::iterator); orderedElementsMap.putAll( type, type.getElements() .filter(IS_NOT_DEPRECATED) .filter(DtdData.Element::isOrdered) .map(DtdData.Element::getName) ::iterator)74 orderedElementsMap.putAll( 75 type, 76 type.getElements() 77 .filter(IS_NOT_DEPRECATED) 78 .filter(DtdData.Element::isOrdered) 79 .map(DtdData.Element::getName) 80 ::iterator); 81 } 82 // Special case "alias" is an alternate leaf element for a lot of LDML elements. leafElementsMap.put(LDML, "alias")83 leafElementsMap.put(LDML, "alias"); 84 LEAF_ELEMENTS_MAP = leafElementsMap.build(); 85 ORDERED_ELEMENTS_MAP = orderedElementsMap.build(); 86 } 87 88 // A map of path comparators for each supported DTD. 89 private static final ImmutableMap<CldrDataType, DtdPathComparator> PATH_COMPARATOR_MAP; 90 91 // This static block must come after the attribute map is created since it uses it. 92 static { 93 ImmutableMap.Builder<CldrDataType, DtdPathComparator> pathMap = ImmutableMap.builder(); 94 for (CldrDataType type : CldrDataType.values()) { pathMap.put(type, new DtdPathComparator(type))95 pathMap.put(type, new DtdPathComparator(type)); 96 } 97 PATH_COMPARATOR_MAP = pathMap.build(); 98 } 99 100 /** 101 * Returns a comparator for {@link CldrPath}s which is compatible with the canonical path 102 * ordering for the given DTD instance (e.g. {@link DtdData#getDtdComparator}()). 103 */ 104 // TODO: Add tests to ensure it continues to agree to DTD ordering. getPathComparator(CldrDataType type)105 static Comparator<CldrPath> getPathComparator(CldrDataType type) { 106 return PATH_COMPARATOR_MAP.get(type); 107 } 108 109 private static final class DtdPathComparator implements Comparator<CldrPath> { 110 private final Comparator<String> elementNameComparator; 111 private final Comparator<String> attributeNameComparator; 112 DtdPathComparator(CldrDataType dataType)113 private DtdPathComparator(CldrDataType dataType) { 114 this.elementNameComparator = dataType.getElementComparator(); 115 this.attributeNameComparator = dataType.getAttributeComparator(); 116 } 117 118 // This code should only return "signum" values for ordering (i.e. {-1, 0, 1}). 119 @Override compare(CldrPath lhs, CldrPath rhs)120 public int compare(CldrPath lhs, CldrPath rhs) { 121 int length = lhs.getLength(); 122 if (length == rhs.getLength()) { 123 if (length == 1) { 124 // Root nodes are special as they define the DTD type and must always be equal. 125 // Paths with different types can be compared, but not via this comparator. 126 checkState(lhs.getName().equals(rhs.getName()), 127 "cannot compare paths with different DTD type: %s / %s", lhs, rhs); 128 return 0; 129 } 130 // Compare parent paths first and return if they give an ordering. 131 int signum = compare(lhs.getParent(), rhs.getParent()); 132 return (signum != 0) ? signum : compareCurrentElement(lhs, rhs); 133 } else if (length < rhs.getLength()) { 134 // Recursively shorten the RHS path until it's the same length. 135 int signum = compare(lhs, rhs.getParent()); 136 // If the LHS is equal to the RHS parent, then the (shorter) LHS path comes first. 137 // Note this this can only happen if we are comparing non-leaf node paths. 138 return signum != 0 ? signum : LHS_FIRST; 139 } else { 140 // Flip the comparison if LHS was longer (we do this at most once per comparison). 141 return -compare(rhs, lhs); 142 } 143 } 144 compareCurrentElement(CldrPath lhs, CldrPath rhs)145 private int compareCurrentElement(CldrPath lhs, CldrPath rhs) { 146 String elementName = lhs.getName(); 147 int signum = signum(elementNameComparator.compare(elementName, rhs.getName())); 148 if (signum != 0) { 149 return signum; 150 } 151 // Primary order within a path element is defined by the element index (this is a 152 // bit of a special value used only for "ORDERED" elements in the DTD. This always 153 // trumps any attribute ordering but is almost always -1. 154 signum = Integer.compare(lhs.getSortIndex(), rhs.getSortIndex()); 155 if (signum != 0) { 156 return signum; 157 } 158 // Element name is the same, so test attributes. Attributes are already known to be 159 // ordered by the element's DTD order, so we only need to find and compare the first 160 // difference. 161 int minAttributeCount = 162 Math.min(lhs.getAttributeCount(), rhs.getAttributeCount()); 163 for (int n = 0; n < minAttributeCount && signum == 0; n++) { 164 String attributeName = lhs.getLocalAttributeName(n); 165 // Important: We negate the comparison result here because we want elements with 166 // "missing" attributes to sort earlier. 167 // 168 // E.g. for two elements LHS="foo[a=x][b=y]" and RHS="foo[b=y]" we want to say 169 // that "RHS < LHS" because RHS is "missing" the attribute "[a=x]". But when we 170 // compare the first attributes we find "a" (LHS) < "b" (RHS), which is the 171 // opposite of what we want. This is because while LHS has a lower ordered 172 // attribute, that indicates that RHS is missing that attribute in the same 173 // position, which should make RHS sort first. 174 signum = -signum( 175 attributeNameComparator.compare(attributeName, rhs.getLocalAttributeName(n))); 176 if (signum == 0) { 177 // Attribute names equal, so test attribute value. 178 signum = signum( 179 DtdData.getAttributeValueComparator(elementName, attributeName) 180 .compare(lhs.getLocalAttributeValue(n), rhs.getLocalAttributeValue(n))); 181 } 182 } 183 if (signum == 0) { 184 // Attributes match up to the minimum length, but one element might have more. 185 // Elements with more attributes sort _after_ those without. 186 if (lhs.getAttributeCount() > minAttributeCount) { 187 signum = RHS_FIRST; 188 } else if (rhs.getAttributeCount() > minAttributeCount) { 189 signum = LHS_FIRST; 190 } 191 } 192 return signum; 193 } 194 } 195 196 /** Returns whether this path is a full path ending in a "leaf" element. */ isLeafPath(CldrPath path)197 static boolean isLeafPath(CldrPath path) { 198 String lastElementName = path.getName(); 199 return lastElementName.indexOf(':') != -1 200 || LEAF_ELEMENTS_MAP.get(path.getDataType()).contains(lastElementName); 201 } 202 203 /** 204 * Returns whether an element is "ORDERED" in the specified DTD (and should have an explicit 205 * sort index). 206 */ isOrdered(CldrDataType dtdType, String elementName)207 static boolean isOrdered(CldrDataType dtdType, String elementName) { 208 // Elements with namespaces unknown the the DTD are never ordered. 209 if (elementName.indexOf(':') != -1) { 210 return false; 211 } 212 // Returns empty set if DTD unknown, but it might also be the case that a valid DTD has no 213 // ordered elements, so we can't reasonable check for anything here. 214 return ORDERED_ELEMENTS_MAP.get(dtdType).contains(elementName); 215 } 216 217 // This can't go further up due to static initialization ordering issues. 218 // TODO: Move all reading of DTDs and setup for paths into a lazy holder. 219 private static final CldrPath LDML_VERSION = 220 CldrPath.parseDistinguishingPath("//ldml/identity/version"); 221 222 /** 223 * Returns whether this path should be emitted by a data supplier. New cases can be added 224 * as needed. 225 */ shouldEmit(CldrPath path)226 static boolean shouldEmit(CldrPath path) { 227 // CLDRFile seems to do some interesting things with the version based on the DTD in 228 // which we see: 229 // 230 // <!ATTLIST version number CDATA #REQUIRED > 231 // <!--@MATCH:regex/\$Revision.*\$--> 232 // <!--@METADATA--> 233 // 234 // <!ATTLIST version cldrVersion CDATA #FIXED "36" > 235 // <!--@MATCH:any--> 236 // <!--@VALUE--> 237 // 238 // This results in conflict between values obtained via CLDRFile and those obtained 239 // directly by parsing an LDML XML file (which happens for "special" XML files in ICU). 240 // 241 // So for now I've decided to just ignore the version (since it's hardly used in the 242 // ICU converter code and always available via CldrDataSupplier#getCldrVersionString(). 243 244 // Hacky way to detect the version declaration in LDML files (which must always be 245 // skipped since they are duplicate paths and reveal XML file boundaries). The path is 246 // always "//ldmlXxxx/version" for some DTD type. 247 if (path.getLength() == 2 && path.getName().equals("version")) { 248 return false; 249 } 250 251 // Note that there is a need for some kind of versioning for some bits of data (since 252 // changes to things like collation can invalidate database search indexes) but this should 253 // be handled directly in the logic which builds the ICU data and isn't strictly the same 254 // as the LDML version anyway. 255 // 256 // TODO: Remove this code if and when LDML version strings are removed. 257 return !path.equals(LDML_VERSION); 258 } 259 260 /** 261 * Processes a full path to extract a distinguishing CldrPath and handle any value attributes 262 * present. This is designed for iterating over successive paths from CLDRFile, but can be used 263 * to create paths in isolation if necessary. 264 * 265 * @param fullPath a parsed full path. 266 * @param previousElements an optional list of previous CldrPath elements which will be used 267 * as the prefix to the new path wherever possible (e.g. if previousElements="(a,b,c,d)" 268 * and "fullPath=a/b/x/y/z", then the new path will share the path prefix up to and 269 * including 'b'). When processing sorted paths, this will greatly reduce allocations. 270 * @param valueAttributeCollector a collector into which value attributes will be added (in 271 * DTD order). 272 * @return a new CldrPath corresponding to the distinguishing path of {@code fullPath}. 273 * @throws IllegalArgumentException if the path string is invalid. 274 */ processXPath( String fullPath, List<CldrPath> previousElements, BiConsumer<AttributeKey, String> valueAttributeCollector)275 static CldrPath processXPath( 276 String fullPath, 277 List<CldrPath> previousElements, 278 BiConsumer<AttributeKey, String> valueAttributeCollector) { 279 checkArgument(!fullPath.isEmpty(), "path must not be empty"); 280 // This fails if attribute names are invalid, but not if element names are invalid, and we 281 // want to control/stabalize the error messages in this API. 282 XPathParts pathParts; 283 try { 284 pathParts = XPathParts.getFrozenInstance(fullPath); 285 } catch (IllegalArgumentException e) { 286 throw new IllegalArgumentException("invalid path: " + fullPath); 287 } 288 int length = pathParts.size(); 289 checkArgument(length > 0, "cldr path must not be empty: %s", pathParts); 290 DtdData dtd = pathParts.getDtdData(); 291 checkArgument(dtd != null, "unknown DTD type: %s", pathParts); 292 CldrDataType dtdType = CldrDataType.forRawType(dtd.dtdType); 293 294 // The path we're returning, constructed from top-to-bottom. 295 CldrPath path = null; 296 // Reusable key/value attributes list. 297 List<String> keyValuePairs = new ArrayList<>(); 298 Consumer<Entry<String, String>> collectElementAttribute = e -> { 299 keyValuePairs.add(e.getKey()); 300 keyValuePairs.add(e.getValue()); 301 }; 302 // True once we've started to create a new path rather than reusing previous elements. 303 boolean diverged = false; 304 for (int n = 0; n < length; n++) { 305 String elementName = pathParts.getElement(n); 306 307 // If this path is from CldrPath.toString() then the sort index is encoded in the 308 // element name suffix (e.g. foo#42[@bar="x"]), otherwise it's in the synthetic "_q" 309 // attribute. Most often there is no sort index however, so this code should optimize 310 // to the null case. 311 int sortIndex = -1; 312 Map<String, String> attributes = pathParts.getAttributes(n); 313 int nameEnd = elementName.indexOf('#'); 314 if (nameEnd != -1) { 315 sortIndex = Integer.parseUnsignedInt(elementName.substring(nameEnd + 1)); 316 elementName = elementName.substring(0, nameEnd); 317 } else { 318 String sortIndexStr = attributes.get(HIDDEN_SORT_INDEX_ATTRIBUTE); 319 if (sortIndexStr != null) { 320 sortIndex = Integer.parseUnsignedInt(sortIndexStr); 321 } 322 } 323 324 // Note that element names need not be known to the DTD. If an element's name is 325 // prefixed with an unknown namespace (e.g. "icu:") then it is always permitted. 326 // Similarly, we never filter out attributes in unknown namespaces. For now we assume 327 // that any explicit namespace is unknown. 328 boolean hasNamespace = elementName.indexOf(':') != -1; 329 checkArgument(hasNamespace || dtd.getElementFromName().containsKey(elementName), 330 "invalid path: %s", fullPath); 331 332 // The keyValuePairs list is used by the collectElementAttribute callback but we don't 333 // want/need to make a new callback each time, so just clear the underlying list. 334 keyValuePairs.clear(); 335 processPathAttributes( 336 elementName, attributes, dtdType, collectElementAttribute, valueAttributeCollector); 337 338 // WARNING: We cannot just get the draft attribute value from the attributes map (as 339 // would be expected) because it throws an exception. This is because you can only 340 // "get()" values from the attributes map if they are potentially valid attributes for 341 // that element. Unfortunately this is due to a deliberate choice in the implementation 342 // of MapComparator, which explicitly throws an exception if an unknown attribute is 343 // encountered. Thus we have to check that "draft" is a possible attribute before 344 // asking for its value. 345 // 346 // TODO(dbeaumont): Fix this properly, ideally by fixing the comparator to not throw. 347 CldrDraftStatus draftStatus = dtd.getAttribute(elementName, "draft") != null 348 ? CldrDraftStatus.forString(attributes.get("draft")) : null; 349 350 if (!diverged && n < previousElements.size()) { 351 CldrPath p = previousElements.get(n); 352 if (p.matchesContent(elementName, sortIndex, keyValuePairs, draftStatus)) { 353 // The previous path we processed was the same at least down to this depth, so 354 // we can just reuse that element instead of creating a new one. 355 // 356 // In tests over the resolved paths for "en_GB" in DTD order, there are ~128k 357 // path elements processed, with ~103k being reused here and ~25k being newly 358 // allocated (a > 80% reuse rate). However this works best when the incoming 359 // paths are sorted, since otherwise the "previous" path is random. 360 // 361 // For unsorted paths, the reuse rate is reduced to approximately 25%. This is 362 // still saving ~32k allocations for the "en_GB" example, and it still saved 363 // about 35% in terms of measured time to process the paths. 364 path = p; 365 continue; 366 } 367 } 368 // This path has diverged from the previous path, so we must start making new elements. 369 path = new CldrPath(path, elementName, keyValuePairs, dtdType, draftStatus, sortIndex); 370 diverged = true; 371 } 372 return path; 373 } 374 375 // Returns the element's sort index (or -1 if not present). processPathAttributes( String elementName, Map<String, String> attributeMap, CldrDataType dtdType, Consumer<Entry<String, String>> collectElementAttribute, BiConsumer<AttributeKey, String> collectValueAttribute)376 static void processPathAttributes( 377 String elementName, 378 /* @Nullable */ Map<String, String> attributeMap, 379 CldrDataType dtdType, 380 Consumer<Entry<String, String>> collectElementAttribute, 381 BiConsumer<AttributeKey, String> collectValueAttribute) { 382 383 // Why not just a Map? Maps don't efficiently handle "get the Nth element" which is 384 // something that's used a lot in the ICU conversion code. Distinguishing attributes in 385 // paths are used more as a "list of pairs" than a map with key/value lookup. Even 386 // extensions like "NavigableMap" don't give you what you want. 387 // 388 // This does mean that lookup-by-name is a linear search (unless you want to pay the cost 389 // of also having a map here) but the average number of attributes is very small (<3). 390 if (attributeMap != null && !attributeMap.isEmpty()) { 391 processAttributes( 392 attributeMap.entrySet().stream(), elementName, collectValueAttribute, dtdType) 393 .forEach(collectElementAttribute); 394 } 395 } 396 processAttributes( Stream<Entry<String, String>> in, String elementName, BiConsumer<AttributeKey, String> collectValueAttribute, CldrDataType dtdType)397 static Stream<Entry<String, String>> processAttributes( 398 Stream<Entry<String, String>> in, 399 String elementName, 400 BiConsumer<AttributeKey, String> collectValueAttribute, 401 CldrDataType dtdType) { 402 Consumer<Entry<String, String>> collectValueAttributes = e -> { 403 if (dtdType.isValueAttribute(elementName, e.getKey())) { 404 collectValueAttribute.accept(AttributeKey.keyOf(elementName, e.getKey()), e.getValue()); 405 } 406 }; 407 return in 408 // Special case of a synthetic distinguishing attribute that's _not_ in the DTD, and 409 // should be ignored. 410 .filter(e -> !e.getKey().equals(HIDDEN_SORT_INDEX_ATTRIBUTE)) 411 .peek(collectValueAttributes) 412 .filter(e -> dtdType.isDistinguishingAttribute(elementName, e.getKey())); 413 } 414 CldrPaths()415 private CldrPaths() {} 416 } 417