1 package org.unicode.cldr.api; 2 3 import static com.google.common.base.CharMatcher.anyOf; 4 import static com.google.common.base.CharMatcher.inRange; 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static com.google.common.base.Preconditions.checkElementIndex; 7 import static com.google.common.base.Preconditions.checkNotNull; 8 import static com.google.common.base.Preconditions.checkState; 9 import static java.util.stream.Collectors.toCollection; 10 11 import java.util.ArrayList; 12 import java.util.Comparator; 13 import java.util.List; 14 import java.util.Objects; 15 import java.util.Optional; 16 import java.util.stream.Collectors; 17 import java.util.stream.IntStream; 18 19 import org.unicode.cldr.api.AttributeKey.AttributeSupplier; 20 21 import com.google.common.base.CharMatcher; 22 import com.google.common.collect.ImmutableList; 23 import com.google.common.collect.ImmutableMap; 24 25 /** 26 * A sequence of CLDR path elements and "distinguishing" attributes. 27 * 28 * <p>In CLDR, a path is composed of a series of elements and associated attributes, where the 29 * attributes can be one of three types; "distinguishing", "value" and "metadata". 30 * 31 * <p>CldrPath instances hold only "distinguishing" attributes, with "value" attributes being held 32 * by the associated {@link CldrValue} instance, and "metadata" attributes being ignored completely 33 * since they are internal to the core CLDR classes. This approach ensures that {@code CldrPath} 34 * instances uniquely identify values and can be used as keys in maps. 35 * 36 * <p>When viewing {@code CldrPath} as strings, it is sometimes necessary to introduce an suffix to 37 * the path element name to indicate the "sort order". This is necessary to represent "ordered" 38 * elements which can appear in the LDML data (though these are rare). This suffix takes the form of 39 * {@code "{element-name}#{sort-index}"} where {@code {sort-index}} is a non-negative index which 40 * corresponds to the value returned from {@link #getSortIndex()} for that path element. The natural 41 * ordering of {@code CldrPath} handles the correctly, but you cannot sort paths properly by just 42 * comparing their string representation. 43 * 44 * <p>CldrPath is an immutable value type with efficient equality semantics. 45 * 46 * <p>See <a href="https://www.unicode.org/reports/tr35/#Definitions">the LDML specification</a> 47 * for more details. 48 */ 49 public final class CldrPath implements AttributeSupplier, Comparable<CldrPath> { 50 // This is approximate and DOES NOT promise correctness, it's mainly there to catch unexpected 51 // changes in the data and it can be updated or removed if needed. At the very least element 52 // names must not contain '/', '[', ']', '@', '=', '#' or '"' to permit re-parsing of paths. 53 private static final CharMatcher NAME_CHARACTERS = 54 inRange('a', 'z').or(inRange('A', 'Z')).or(inRange('0', '9')).or(anyOf(":-_")); 55 56 /** 57 * Parses a distinguishing CLDR path string, including "distinguishing" and private "metadata" 58 * attributes into a normalized {@link CldrPath} instance. Attributes will be parsed and handled 59 * according to their type: 60 * <ul> 61 * <li>Distinguishing attributes will be added to the returned CldrPath instance. 62 * <li>Non-public metadata attributes will be ignored. 63 * <li>Value attributes are not permitted in distinguishing paths and will cause an error. 64 * </ul> 65 * 66 * <p>The path string must be structured correctly (e.g. "//ldml/foo[@bar="baz]") and must 67 * represent a known DTD type, based on the first path element (e.g. "ldml" or 68 * "supplementalData"). 69 * 70 * @param path the distinguishing path string, containing only distinguishing attributes. 71 * @return the parsed distinguishing path instance. 72 * @throws IllegalArgumentException if the path is not well formed (e.g. contains unexpected 73 * value or metadata attributes). 74 */ parseDistinguishingPath(String path)75 public static CldrPath parseDistinguishingPath(String path) { 76 return CldrPaths.processXPath(path, ImmutableList.of(), (k, v) -> { 77 throw new IllegalArgumentException(String.format( 78 "unexpected value attribute '%s' in distinguishing path: %s", k, path)); 79 }); 80 } 81 82 /** 83 * Returns the number of common path elements from the root. This is useful when determining 84 * the last common ancestor during visitation. A {@code null} path is treated as having zero 85 * length (and will always result in zero being returned). 86 * 87 * <p>Note: This is only currently use by PrefixVisitorHost, but could be made public if needed. 88 * It's only here (rather than in PrefixVisitorHost) because it depends on private methods of 89 * this class. 90 */ getCommonPrefixLength( CldrPath a, CldrPath b)91 static int getCommonPrefixLength(/* @Nullable */ CldrPath a, /* @Nullable */ CldrPath b) { 92 // Null paths are sentinels with zero length. 93 if (a == null || b == null) { 94 return 0; 95 } 96 97 // Trim whichever path is longer until both are same length. 98 int minLength = Math.min(a.getLength(), b.getLength()); 99 while (a.getLength() > minLength) 100 a = a.getParent(); 101 while (b.getLength() > minLength) 102 b = b.getParent(); 103 104 // Work up the paths, resetting the common length every time the elements differ. 105 int commonLength = minLength; 106 for (int length = minLength; length > 0; length--) { 107 if (!a.localEquals(b)) { 108 // Elements differ, so shortest possible common length is that of our parent path. 109 commonLength = length - 1; 110 } 111 // Parents will both be null on the last iteration, but never used. 112 a = a.getParent(); 113 b = b.getParent(); 114 } 115 return commonLength; 116 } 117 118 // If the parent is null then this path element is the top level LDML descriptor. 119 /* @Nullable */ private final CldrPath parent; 120 // The number of elements (including this one) in this path. 121 private final int length; 122 private final String elementName; 123 // The attribute keys and values in an alternating list. 124 private final ImmutableList<String> attributeKeyValuePairs; 125 // Inherits the top-most draft status in a path (which matches what CLDRFile appears to do. 126 private final Optional<CldrDraftStatus> draftStatus; 127 // The sort index for "ORDERED" path elements or (more commonly) -1 for non-ORDERED elements. 128 private final int sortIndex; 129 // The DTD type of the path (which is the same for all path elements). 130 private final CldrDataType dtdType; 131 // The proper DTD ordering for this path (based on the DTD type). 132 private final Comparator<CldrPath> ordering; 133 // Cached since ImmutableList recalculates its hash codes every time. 134 private final int hashCode; 135 // Cached on demand (it's useful during equality checking as well as toString() itself). 136 // However for elements with attributes, it's at least as large as the name and all attribute 137 // keys/values combined, so will typically double the in-memory size of the instance. We don't 138 // care about making assignment atomic however, since all values would be equal anyway. 139 private String localToString = null; 140 CldrPath(CldrPath parent, String name, List<String> attributeKeyValuePairs, CldrDataType dtdType, CldrDraftStatus localDraftStatus, int sortIndex)141 CldrPath(CldrPath parent, 142 String name, 143 List<String> attributeKeyValuePairs, 144 CldrDataType dtdType, 145 /* @Nullable */ CldrDraftStatus localDraftStatus, int sortIndex) { 146 checkState(parent != null || dtdType.getLdmlName().equals(name), 147 "unexpected root element: expected %s, but got %s", dtdType.getLdmlName(), name); 148 this.parent = parent; 149 this.length = (parent != null ? parent.getLength() : 0) + 1; 150 this.elementName = checkValidName(name, "element"); 151 this.attributeKeyValuePairs = checkKeyValuePairs(attributeKeyValuePairs); 152 this.draftStatus = resolveDraftStatus(parent, localDraftStatus); 153 // Ordered elements have a sort index of 0 or more, and un-ordered have an index of -1. 154 if (CldrPaths.isOrdered(dtdType, elementName)) { 155 checkArgument(sortIndex >= 0, 156 "missing or invalid sort index '%s' for element: %s", sortIndex, elementName); 157 } else { 158 checkArgument(sortIndex == -1, 159 "unexpected sort index '%s' for element: %s", sortIndex, elementName); 160 } 161 this.sortIndex = sortIndex; 162 this.dtdType = checkNotNull(dtdType); 163 this.ordering = CldrPaths.getPathComparator(dtdType); 164 this.hashCode = Objects.hash(length, name, this.attributeKeyValuePairs, parent); 165 } 166 checkValidName(String value, String description)167 private static String checkValidName(String value, String description) { 168 checkArgument(!value.isEmpty(), "%s name cannot be empty", description); 169 checkArgument(NAME_CHARACTERS.matchesAllOf(value), 170 "invalid character in %s name: %s", description, value); 171 return value; 172 } 173 checkKeyValuePairs(List<String> keyValuePairs)174 private static ImmutableList<String> checkKeyValuePairs(List<String> keyValuePairs) { 175 // Ensure attribute values never have double-quote in them (since current we don't escape 176 // value when putting into toString(). 177 checkArgument((keyValuePairs.size() & 1) == 0, 178 "key/value pairs must have an even number of elements: %s", keyValuePairs); 179 for (int n = 0; n < keyValuePairs.size(); n += 2) { 180 checkValidName(keyValuePairs.get(n), "attribute"); 181 String v = keyValuePairs.get(n + 1); 182 checkArgument(!v.contains("\""), "unsupported '\"' in attribute value: %s", v); 183 } 184 return ImmutableList.copyOf(keyValuePairs); 185 } 186 187 /** 188 * Helper method used to test for equivalent path element content. Used to help avoid 189 * unnecessary allocations during processing (see {@link CldrFileDataSource}). 190 */ matchesContent( String name, int sortIndex, List<String> keyValuePairs, CldrDraftStatus draftStatus)191 boolean matchesContent( 192 String name, 193 int sortIndex, 194 List<String> keyValuePairs, /* Nullable */ CldrDraftStatus draftStatus) { 195 return this.elementName.equals(name) 196 && this.sortIndex == sortIndex 197 && this.attributeKeyValuePairs.equals(keyValuePairs) 198 && this.draftStatus.equals(resolveDraftStatus(this.parent, draftStatus)); 199 } 200 201 // Helper to resolve the current draft status of a path based on any local draft status 202 // attributes and any existing parent status. 203 // 204 // Note that this behaviour currently matches CLDRFile behaviour as of May 2019. CLDRFile 205 // does a regex match over the full path and finds the first draft status attribute that's 206 // present, ignoring any later declarations (even if they are more restrictive). It seems 207 // likely that the implicit expectation in CLDRFile is that there's only ever one draft 208 // status attributes in any given path (see the pop() method in MyDeclHandler). resolveDraftStatus( CldrPath parent, CldrDraftStatus localStatus)209 private static Optional<CldrDraftStatus> resolveDraftStatus( 210 /* @Nullable */ CldrPath parent, /* @Nullable */ CldrDraftStatus localStatus) { 211 if (parent != null && parent.draftStatus.isPresent()) { 212 return parent.draftStatus; 213 } 214 return localStatus != null ? localStatus.asOptional() : Optional.empty(); 215 } 216 217 /** 218 * Returns the parent path element. 219 * 220 * @return the parent path element (or {@code null} if this was the root element). 221 */ 222 /* @Nullable */ getParent()223 public CldrPath getParent() { 224 return parent; 225 } 226 227 /** 228 * Returns the non-zero length of this path (i.e. the number of elements it is made up of). 229 * 230 * @return the number of elements in this path. 231 */ getLength()232 public int getLength() { 233 return length; 234 } 235 236 /** 237 * Returns the name of this path element. This is the qualified XML name as it appears in the 238 * CLDR XML files, including namespace (e.g. "icu:transforms") though most attributes are not 239 * part of an explicit namespace. 240 * 241 * @return the ASCII-only name of the "lowest" element in this path. 242 */ 243 // TODO: This method name is weak - perhaps getElementName() ? getName()244 public String getName() { 245 return elementName; 246 } 247 248 /** 249 * Returns the sort index of an "ordered" path element, or {@code -1} for non-ordered elements. 250 * 251 * <p>The sort index is used to disambiguate and sort otherwise identical distinguishing paths. 252 * The feature allows a series of values to be processed reliably in an ordered sequence. It is 253 * recommended that if your data includes "ordered" elements, they are always processed in 254 * {@link org.unicode.cldr.api.CldrData.PathOrder#DTD DTD} order. 255 * 256 * @return the element's sort index (which takes priority for element ordering over any 257 * attribute values). 258 */ getSortIndex()259 public int getSortIndex() { 260 return sortIndex; 261 } 262 263 /** 264 * Returns the raw value of an attribute associated with this CLDR path, or null if not present. 265 * For almost all use cases it is preferable to use the accessor methods on the {@link 266 * AttributeKey} class, which provide additional useful semantic checking and common type 267 * conversion. You should only use this method directly if there's a strong performance 268 * requirement. 269 * 270 * @param key the key identifying an attribute. 271 * @return the attribute value or {@code null} if not present. 272 * @see AttributeKey 273 */ 274 @Override get(AttributeKey key)275 /* @Nullable */ public String get(AttributeKey key) { 276 checkArgument(!getDataType().isValueAttribute(key), 277 "cannot get 'value attribute' values from a distinguishing path: %s", key); 278 String v = null; 279 for (CldrPath p = this; v == null && p != null; p = p.getParent()) { 280 if (p.getName().equals(key.getElementName())) { 281 v = p.getLocalAttributeValue(key.getAttributeName()); 282 } 283 } 284 return v; 285 } 286 287 /** 288 * Returns the data type for this path, as defined by the top most element. 289 * 290 * @return the path's data type. 291 */ 292 @Override getDataType()293 public CldrDataType getDataType() { 294 return dtdType; 295 } 296 297 /** 298 * Returns whether the given element name is in this path. 299 * 300 * @param elementName the element name to check. 301 * @return true if the name of this path element or any of its parents is equal to the given 302 * element name. 303 */ containsElement(String elementName)304 boolean containsElement(String elementName) { 305 return getName().equals(elementName) 306 || (parent != null && parent.containsElement(elementName)); 307 } 308 309 /** 310 * Returns the number of distinguishing attributes for this path element. 311 * 312 * @return the number of distinguishing attributes for the "lowest" element in this path. 313 */ getAttributeCount()314 int getAttributeCount() { 315 return attributeKeyValuePairs.size() / 2; 316 } 317 318 /** 319 * Returns the (non empty) name of the Nth distinguishing attribute in the leaf path element. 320 * 321 * @param n the index of a distinguishing attribute for the "lowest" element in this path. 322 * @return the name of the Nth attribute on this path element. 323 */ getLocalAttributeName(int n)324 String getLocalAttributeName(int n) { 325 checkElementIndex(n, getAttributeCount()); 326 return attributeKeyValuePairs.get(2 * n); 327 } 328 329 /** 330 * Returns the value of the Nth distinguishing attribute in the leaf path element. 331 * 332 * @param n the index of a distinguishing attribute for the "lowest" element in this path. 333 * @return the value of the Nth attribute on this path element. 334 */ getLocalAttributeValue(int n)335 String getLocalAttributeValue(int n) { 336 checkElementIndex(n, getAttributeCount()); 337 return attributeKeyValuePairs.get((2 * n) + 1); 338 } 339 340 // Helper to get an attribute by name from this path element. getLocalAttributeValue(String attributeName)341 private String getLocalAttributeValue(String attributeName) { 342 checkNotNull(attributeName); 343 for (int i = 0; i < attributeKeyValuePairs.size(); i += 2) { 344 if (attributeName.equals(attributeKeyValuePairs.get(i))) { 345 return attributeKeyValuePairs.get(i + 1); 346 } 347 } 348 return null; 349 } 350 351 /** 352 * Returns the draft status for this path, which is inherited from parent path elements. Note 353 * that where multiple (possibly conflicting) draft statuses are defined in a path, the "top 354 * most" (i.e. closest to the root element) value is used. 355 * 356 * @return the potentially inherited draft status. 357 */ getDraftStatus()358 Optional<CldrDraftStatus> getDraftStatus() { 359 return draftStatus; 360 } 361 362 /** 363 * Returns a combined full path string in the XPath style {@code //foo/bar[@x="y"]/baz}, 364 * with value attributes inserted in correct DTD order for each path element. 365 * 366 * <p>Note that while in most cases the values attributes simply follow the path attributes on 367 * each element, this is not necessarily always true, and DTD ordering can place value 368 * attributes before path attributes in an element. 369 * 370 * @param value a value to be associated with this path (from which value attributes will be 371 * obtained). 372 * @return the full XPath representation containing both distinguishing and value attributes. 373 */ getFullPath(CldrValue value)374 String getFullPath(CldrValue value) { 375 checkNotNull(value); 376 return appendToString(new StringBuilder(), value.getValueAttributes()).toString(); 377 } 378 379 /** Compares two paths in DTD order. */ 380 @Override compareTo(CldrPath other)381 public int compareTo(CldrPath other) { 382 if (dtdType == other.dtdType) { 383 return ordering.compare(this, other); 384 } 385 return dtdType.compareTo(other.dtdType); 386 } 387 388 /** {@inheritDoc} */ 389 @Override equals(Object obj)390 public boolean equals(Object obj) { 391 if (obj == this) { 392 return true; 393 } 394 if (!(obj instanceof CldrPath)) { 395 return false; 396 } 397 CldrPath other = (CldrPath) obj; 398 // Check type and length first (checking length catches most non-equal paths). 399 if (!getDataType().equals(other.getDataType()) || length != other.length) { 400 return false; 401 } 402 // Do (n - 1) comparisons since we already know the root element is the same (the root 403 // element never has value attributes on it and is uniquely defined by the DTD type). 404 // Working up from the "leaf" of the path works well because different paths will almost 405 // always be different in the leaf element (i.e. we will fail almost immediately). 406 CldrPath pthis = this; 407 for (int n = length - 1; n > 0; n--) { 408 if (!pthis.localEquals(other)) { 409 return false; 410 } 411 pthis = pthis.getParent(); 412 other = other.getParent(); 413 } 414 return true; 415 } 416 417 /** {@inheritDoc} */ 418 @Override hashCode()419 public int hashCode() { 420 return hashCode; 421 } 422 423 /** @return the distinguishing path string in the XPath style {@code //foo/bar[@x="y"]/baz}. */ 424 @Override toString()425 public String toString() { 426 return appendToString(new StringBuilder(), ImmutableMap.of()).toString(); 427 } 428 localEquals(CldrPath other)429 private boolean localEquals(CldrPath other) { 430 // In _theory_ only length and localToString need to be checked (since the localToString is 431 // an unambiguous representation of the data, but it seems a bit hacky to rely on that. 432 return this.elementName.equals(other.elementName) 433 && this.sortIndex == other.sortIndex 434 && this.attributeKeyValuePairs.equals(other.attributeKeyValuePairs); 435 } 436 437 // XPath like toString() representation of a path element (e.g. foo[@bar="x"][@baz="y"]). 438 // When a sort index is present, it's appended to the element name (e.g. "foo#42[@bar="x"]"). getLocalToString()439 private String getLocalToString() { 440 if (localToString == null) { 441 String str = getName(); 442 if (sortIndex != -1) { 443 // This is very rare so it's almost certainly better to not make a StringBuilder 444 // above just for this possibility. 445 str += "#" + sortIndex; 446 } 447 if (getAttributeCount() > 0) { 448 str = IntStream.range(0, getAttributeCount()) 449 .mapToObj(n -> 450 String.format("[@%s=\"%s\"]", getLocalAttributeName(n), getLocalAttributeValue(n))) 451 .collect(Collectors.joining("", str, "")); 452 } 453 // Overwrite only once the local string is completed (this is idempotent so we don't 454 // have to care about locking etc.). 455 localToString = str; 456 } 457 return localToString; 458 } 459 460 // Recursive helper for toString(). appendToString( StringBuilder out, ImmutableMap<AttributeKey, String> valueAttributes)461 private StringBuilder appendToString( 462 StringBuilder out, ImmutableMap<AttributeKey, String> valueAttributes) { 463 CldrPath parent = getParent(); 464 if (parent != null) { 465 parent.appendToString(out, valueAttributes).append('/'); 466 } else { 467 out.append("//"); 468 } 469 if (valueAttributes.isEmpty()) { 470 return out.append(getLocalToString()); 471 } 472 List<String> attributeNames = 473 valueAttributes.keySet().stream() 474 .filter(k -> k.getElementName().equals(getName())) 475 .map(AttributeKey::getAttributeName) 476 .collect(toCollection(ArrayList::new)); 477 if (attributeNames.isEmpty()) { 478 // No value attributes for _this_ element so can use just the local toString(). 479 return out.append(getLocalToString()); 480 } 481 if (getAttributeCount() > 0) { 482 String lastPathAttributeName = getLocalAttributeName(getAttributeCount() - 1); 483 if (dtdType.getAttributeComparator() 484 .compare(lastPathAttributeName, attributeNames.get(0)) > 0) { 485 // Oops, order is not as expected, so must reorder all attributes. 486 appendResortedValueAttributesTo(out, attributeNames, valueAttributes); 487 return out; 488 } 489 } 490 // Value attributes all come after path attributes. 491 return appendValueAttributesTo(out.append(getLocalToString()), attributeNames, valueAttributes); 492 } 493 appendResortedValueAttributesTo( StringBuilder out, List<String> attributeNames, ImmutableMap<AttributeKey, String> valueAttributes)494 private void appendResortedValueAttributesTo( 495 StringBuilder out, 496 List<String> attributeNames, 497 ImmutableMap<AttributeKey, String> valueAttributes) { 498 out.append(elementName); 499 for (int n = 0; n < attributeKeyValuePairs.size(); n += 2) { 500 attributeNames.add(attributeKeyValuePairs.get(n)); 501 } 502 attributeNames.sort(dtdType.getAttributeComparator()); 503 for (String attrName : attributeNames) { 504 String value = getLocalAttributeValue(attrName); 505 if (value == null) { 506 value = valueAttributes.get(AttributeKey.keyOf(elementName, attrName)); 507 checkState(value != null, "missing value %s:%s", elementName, attrName); 508 } 509 out.append(String.format("[@%s=\"%s\"]", attrName, value)); 510 } 511 } 512 appendValueAttributesTo( StringBuilder out, List<String> attributeNames, ImmutableMap<AttributeKey, String> valueAttributes)513 private StringBuilder appendValueAttributesTo( 514 StringBuilder out, 515 List<String> attributeNames, 516 ImmutableMap<AttributeKey, String> valueAttributes) { 517 for (String attrName : attributeNames) { 518 String value = valueAttributes.get(AttributeKey.keyOf(elementName, attrName)); 519 checkState(value != null, "missing value %s:%s", elementName, attrName); 520 out.append(String.format("[@%s=\"%s\"]", attrName, value)); 521 } 522 return out; 523 } 524 } 525