• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.api;
2 
3 import static com.google.common.base.Preconditions.checkArgument;
4 import static com.google.common.base.Preconditions.checkState;
5 import static java.lang.Integer.signum;
6 import static org.unicode.cldr.api.CldrDataType.LDML;
7 
8 import java.util.ArrayList;
9 import java.util.Comparator;
10 import java.util.List;
11 import java.util.Map;
12 import java.util.Map.Entry;
13 import java.util.function.BiConsumer;
14 import java.util.function.Consumer;
15 import java.util.function.Predicate;
16 import java.util.stream.Stream;
17 
18 import org.unicode.cldr.util.DtdData;
19 import org.unicode.cldr.util.XPathParts;
20 
21 import com.google.common.collect.ImmutableMap;
22 import com.google.common.collect.ImmutableSetMultimap;
23 
24 /**
25  * Utilities related to CLDR paths. It's possible that one day we might wish to expose the path
26  * comparator from this class, but if so we should key it off something other than {@code DtdType}.
27  */
28 final class CldrPaths {
29     // Constants to make comparator logic a bit more readable (i.e. LHS < RHS ==> return -1).
30     private static final int LHS_FIRST = -1;
31     private static final int RHS_FIRST = 1;
32 
33     // A synthetic attribute used by CLDR code to apply an explicit ordering to otherwise identical
34     // paths (this only happens for "ORDERED" elements). We handle this differently and have the
35     // sort index as an explicit field in CldrPath rather than treating it as an attribute.
36     private static final String HIDDEN_SORT_INDEX_ATTRIBUTE = "_q";
37 
38     // When calculating things about elements we need to ignore deprecated ones, especially when
39     // looking for "leaf" elements, since CLDR data used to allow mixed content and the DTD still
40     // has elements with both data and (deprecated) child elements present.
41     private static final Predicate<DtdData.Element> IS_NOT_DEPRECATED = e -> !e.isDeprecated();
42 
43     // A map of the leaf element names for each supported DTD.
44     private static final ImmutableSetMultimap<CldrDataType, String> LEAF_ELEMENTS_MAP;
45     // A map of the ordered element names for each supported DTD.
46     private static final ImmutableSetMultimap<CldrDataType, String> ORDERED_ELEMENTS_MAP;
47     static {
48         ImmutableSetMultimap.Builder<CldrDataType, String> leafElementsMap =
49             ImmutableSetMultimap.builder();
50         ImmutableSetMultimap.Builder<CldrDataType, String> orderedElementsMap =
51             ImmutableSetMultimap.builder();
52         for (CldrDataType type : CldrDataType.values()) {
53             // While at happened to be true (at the time of writing) that the getElements() method
54             // returns a new, mutable set, this is completely undocumented so we cannot rely on it
55             // (otherwise we could just do "removeIf(...)").
56             //
57             // Note that while the "specials" element has no children in the DTD, it is not
58             // considered a "leaf" element as it is expected to contain any additional elements
59             // from unknown namespaces (e.g. "icu:").
60             //
61             // We know that Guava's collection classes have a policy of never iterating over a
62             // collection more than once, so it's safe to use the ::iterator trick to convert a
63             // stream into a one-shot iterable (saves have to make temporary collections).
leafElementsMap.putAll( type, type.getElements() .filter(IS_NOT_DEPRECATED) .filter(e -> e.getChildren().keySet().stream().noneMatch(IS_NOT_DEPRECATED)) .map(DtdData.Element::getName) .filter(e -> !e.equals("special")) ::iterator)64             leafElementsMap.putAll(
65                 type,
66                 type.getElements()
67                     .filter(IS_NOT_DEPRECATED)
68                     // NOTE: Some leaf elements still have deprecated children (from when mixed
69                     // data was permitted).
70                     .filter(e -> e.getChildren().keySet().stream().noneMatch(IS_NOT_DEPRECATED))
71                     .map(DtdData.Element::getName)
72                     .filter(e -> !e.equals("special"))
73                     ::iterator);
orderedElementsMap.putAll( type, type.getElements() .filter(IS_NOT_DEPRECATED) .filter(DtdData.Element::isOrdered) .map(DtdData.Element::getName) ::iterator)74             orderedElementsMap.putAll(
75                 type,
76                 type.getElements()
77                     .filter(IS_NOT_DEPRECATED)
78                     .filter(DtdData.Element::isOrdered)
79                     .map(DtdData.Element::getName)
80                     ::iterator);
81         }
82         // Special case "alias" is an alternate leaf element for a lot of LDML elements.
leafElementsMap.put(LDML, "alias")83         leafElementsMap.put(LDML, "alias");
84         LEAF_ELEMENTS_MAP = leafElementsMap.build();
85         ORDERED_ELEMENTS_MAP = orderedElementsMap.build();
86     }
87 
88     // A map of path comparators for each supported DTD.
89     private static final ImmutableMap<CldrDataType, DtdPathComparator> PATH_COMPARATOR_MAP;
90 
91     // This static block must come after the attribute map is created since it uses it.
92     static {
93         ImmutableMap.Builder<CldrDataType, DtdPathComparator> pathMap = ImmutableMap.builder();
94         for (CldrDataType type : CldrDataType.values()) {
pathMap.put(type, new DtdPathComparator(type))95             pathMap.put(type, new DtdPathComparator(type));
96         }
97         PATH_COMPARATOR_MAP = pathMap.build();
98     }
99 
100     /**
101      * Returns a comparator for {@link CldrPath}s which is compatible with the canonical path
102      * ordering for the given DTD instance (e.g. {@link DtdData#getDtdComparator}()).
103      */
104     // TODO: Add tests to ensure it continues to agree to DTD ordering.
getPathComparator(CldrDataType type)105     static Comparator<CldrPath> getPathComparator(CldrDataType type) {
106         return PATH_COMPARATOR_MAP.get(type);
107     }
108 
109     private static final class DtdPathComparator implements Comparator<CldrPath> {
110         private final Comparator<String> elementNameComparator;
111         private final Comparator<String> attributeNameComparator;
112 
DtdPathComparator(CldrDataType dataType)113         private DtdPathComparator(CldrDataType dataType) {
114             this.elementNameComparator = dataType.getElementComparator();
115             this.attributeNameComparator = dataType.getAttributeComparator();
116         }
117 
118         // This code should only return "signum" values for ordering (i.e. {-1, 0, 1}).
119         @Override
compare(CldrPath lhs, CldrPath rhs)120         public int compare(CldrPath lhs, CldrPath rhs) {
121             int length = lhs.getLength();
122             if (length == rhs.getLength()) {
123                 if (length == 1) {
124                     // Root nodes are special as they define the DTD type and must always be equal.
125                     // Paths with different types can be compared, but not via this comparator.
126                     checkState(lhs.getName().equals(rhs.getName()),
127                         "cannot compare paths with different DTD type: %s / %s", lhs, rhs);
128                     return 0;
129                 }
130                 // Compare parent paths first and return if they give an ordering.
131                 int signum = compare(lhs.getParent(), rhs.getParent());
132                 return (signum != 0) ? signum : compareCurrentElement(lhs, rhs);
133             } else if (length < rhs.getLength()) {
134                 // Recursively shorten the RHS path until it's the same length.
135                 int signum = compare(lhs, rhs.getParent());
136                 // If the LHS is equal to the RHS parent, then the (shorter) LHS path comes first.
137                 // Note this this can only happen if we are comparing non-leaf node paths.
138                 return signum != 0 ? signum : LHS_FIRST;
139             } else {
140                 // Flip the comparison if LHS was longer (we do this at most once per comparison).
141                 return -compare(rhs,  lhs);
142             }
143         }
144 
compareCurrentElement(CldrPath lhs, CldrPath rhs)145         private int compareCurrentElement(CldrPath lhs, CldrPath rhs) {
146             String elementName = lhs.getName();
147             int signum = signum(elementNameComparator.compare(elementName, rhs.getName()));
148             if (signum != 0) {
149                 return signum;
150             }
151             // Primary order within a path element is defined by the element index (this is a
152             // bit of a special value used only for "ORDERED" elements in the DTD. This always
153             // trumps any attribute ordering but is almost always -1.
154             signum = Integer.compare(lhs.getSortIndex(), rhs.getSortIndex());
155             if (signum != 0) {
156                 return signum;
157             }
158             // Element name is the same, so test attributes. Attributes are already known to be
159             // ordered by the element's DTD order, so we only need to find and compare the first
160             // difference.
161             int minAttributeCount =
162                 Math.min(lhs.getAttributeCount(), rhs.getAttributeCount());
163             for (int n = 0; n < minAttributeCount && signum == 0; n++) {
164                 String attributeName = lhs.getLocalAttributeName(n);
165                 // Important: We negate the comparison result here because we want elements with
166                 // "missing" attributes to sort earlier.
167                 //
168                 // E.g. for two elements LHS="foo[a=x][b=y]" and RHS="foo[b=y]" we want to say
169                 // that "RHS < LHS" because RHS is "missing" the attribute "[a=x]". But when we
170                 // compare the first attributes we find "a" (LHS) < "b" (RHS), which is the
171                 // opposite of what we want. This is because while LHS has a lower ordered
172                 // attribute, that indicates that RHS is missing that attribute in the same
173                 // position, which should make RHS sort first.
174                 signum = -signum(
175                     attributeNameComparator.compare(attributeName, rhs.getLocalAttributeName(n)));
176                 if (signum == 0) {
177                     // Attribute names equal, so test attribute value.
178                     signum = signum(
179                         DtdData.getAttributeValueComparator(elementName, attributeName)
180                             .compare(lhs.getLocalAttributeValue(n), rhs.getLocalAttributeValue(n)));
181                 }
182             }
183             if (signum == 0) {
184                 // Attributes match up to the minimum length, but one element might have more.
185                 // Elements with more attributes sort _after_ those without.
186                 if (lhs.getAttributeCount() > minAttributeCount) {
187                     signum = RHS_FIRST;
188                 } else if (rhs.getAttributeCount() > minAttributeCount) {
189                     signum = LHS_FIRST;
190                 }
191             }
192             return signum;
193         }
194     }
195 
196     /** Returns whether this path is a full path ending in a "leaf" element. */
isLeafPath(CldrPath path)197     static boolean isLeafPath(CldrPath path) {
198         String lastElementName = path.getName();
199         return lastElementName.indexOf(':') != -1
200             || LEAF_ELEMENTS_MAP.get(path.getDataType()).contains(lastElementName);
201     }
202 
203     /**
204      * Returns whether an element is "ORDERED" in the specified DTD (and should have an explicit
205      * sort index).
206      */
isOrdered(CldrDataType dtdType, String elementName)207     static boolean isOrdered(CldrDataType dtdType, String elementName) {
208         // Elements with namespaces unknown to the DTD are never ordered
209         // (but it knows about icu: elements).
210         if (elementName.indexOf(':') != -1 && !elementName.startsWith("icu:")) {
211             return false;
212         }
213         // Returns empty set if DTD unknown, but it might also be the case that a valid DTD has no
214         // ordered elements, so we can't reasonable check for anything here.
215         return ORDERED_ELEMENTS_MAP.get(dtdType).contains(elementName);
216     }
217 
218     // This can't go further up due to static initialization ordering issues.
219     // TODO: Move all reading of DTDs and setup for paths into a lazy holder.
220     private static final CldrPath LDML_VERSION =
221         CldrPath.parseDistinguishingPath("//ldml/identity/version");
222 
223     /**
224      * Returns whether this path should be emitted by a data supplier. New cases can be added
225      * as needed.
226      */
shouldEmit(CldrPath path)227     static boolean shouldEmit(CldrPath path) {
228         // CLDRFile seems to do some interesting things with the version based on the DTD in
229         // which we see:
230         //
231         // <!ATTLIST version number CDATA #REQUIRED >
232         //    <!--@MATCH:regex/\$Revision.*\$-->
233         //    <!--@METADATA-->
234         //
235         // <!ATTLIST version cldrVersion CDATA #FIXED "36" >
236         //    <!--@MATCH:any-->
237         //    <!--@VALUE-->
238         //
239         // This results in conflict between values obtained via CLDRFile and those obtained
240         // directly by parsing an LDML XML file (which happens for "special" XML files in ICU).
241         //
242         // So for now I've decided to just ignore the version (since it's hardly used in the
243         // ICU converter code and always available via CldrDataSupplier#getCldrVersionString().
244 
245         // Hacky way to detect the version declaration in LDML files (which must always be
246         // skipped since they are duplicate paths and reveal XML file boundaries). The path is
247         // always "//ldmlXxxx/version" for some DTD type.
248         if (path.getLength() == 2 && path.getName().equals("version")) {
249             return false;
250         }
251 
252         // Note that there is a need for some kind of versioning for some bits of data (since
253         // changes to things like collation can invalidate database search indexes) but this should
254         // be handled directly in the logic which builds the ICU data and isn't strictly the same
255         // as the LDML version anyway.
256         //
257         // TODO: Remove this code if and when LDML version strings are removed.
258         return !path.equals(LDML_VERSION);
259     }
260 
261     /**
262      * Processes a full path to extract a distinguishing CldrPath and handle any value attributes
263      * present. This is designed for iterating over successive paths from CLDRFile, but can be used
264      * to create paths in isolation if necessary.
265      *
266      * @param fullPath a parsed full path.
267      * @param previousElements an optional list of previous CldrPath elements which will be used
268      *     as the prefix to the new path wherever possible (e.g. if previousElements="(a,b,c,d)"
269      *     and "fullPath=a/b/x/y/z", then the new path will share the path prefix up to and
270      *     including 'b'). When processing sorted paths, this will greatly reduce allocations.
271      * @param valueAttributeCollector a collector into which value attributes will be added (in
272      *     DTD order).
273      * @return a new CldrPath corresponding to the distinguishing path of {@code fullPath}.
274      * @throws IllegalArgumentException if the path string is invalid.
275      */
processXPath( String fullPath, List<CldrPath> previousElements, BiConsumer<AttributeKey, String> valueAttributeCollector)276     static CldrPath processXPath(
277         String fullPath,
278         List<CldrPath> previousElements,
279         BiConsumer<AttributeKey, String> valueAttributeCollector) {
280         checkArgument(!fullPath.isEmpty(), "path must not be empty");
281         // This fails if attribute names are invalid, but not if element names are invalid, and we
282         // want to control/stabalize the error messages in this API.
283         XPathParts pathParts;
284         try {
285             pathParts = XPathParts.getFrozenInstance(fullPath);
286         } catch (IllegalArgumentException e) {
287             throw new IllegalArgumentException("invalid path: " + fullPath);
288         }
289         int length = pathParts.size();
290         checkArgument(length > 0, "cldr path must not be empty: %s", pathParts);
291         DtdData dtd = pathParts.getDtdData();
292         checkArgument(dtd != null, "unknown DTD type: %s", pathParts);
293         CldrDataType dtdType = CldrDataType.forRawType(dtd.dtdType);
294 
295         // The path we're returning, constructed from top-to-bottom.
296         CldrPath path = null;
297         // Reusable key/value attributes list.
298         List<String> keyValuePairs = new ArrayList<>();
299         Consumer<Entry<String, String>> collectElementAttribute = e -> {
300             keyValuePairs.add(e.getKey());
301             keyValuePairs.add(e.getValue());
302         };
303         // True once we've started to create a new path rather than reusing previous elements.
304         boolean diverged = false;
305         for (int n = 0; n < length; n++) {
306             String elementName = pathParts.getElement(n);
307 
308             // If this path is from CldrPath.toString() then the sort index is encoded in the
309             // element name suffix (e.g. foo#42[@bar="x"]), otherwise it's in the synthetic "_q"
310             // attribute. Most often there is no sort index however, so this code should optimize
311             // to the null case.
312             int sortIndex = -1;
313             Map<String, String> attributes = pathParts.getAttributes(n);
314             int nameEnd = elementName.indexOf('#');
315             if (nameEnd != -1) {
316                 sortIndex = Integer.parseUnsignedInt(elementName.substring(nameEnd + 1));
317                 elementName = elementName.substring(0, nameEnd);
318             } else {
319                 String sortIndexStr = attributes.get(HIDDEN_SORT_INDEX_ATTRIBUTE);
320                 if (sortIndexStr != null) {
321                     sortIndex = Integer.parseUnsignedInt(sortIndexStr);
322                 }
323             }
324 
325             // Note that element names need not be known to the DTD. If an element's name is
326             // prefixed with an unknown namespace (e.g. "icu:") then it is always permitted.
327             // Similarly, we never filter out attributes in unknown namespaces. For now we assume
328             // that any explicit namespace is unknown.
329             boolean hasNamespace = elementName.indexOf(':') != -1;
330             checkArgument(hasNamespace || dtd.getElementFromName().containsKey(elementName),
331                 "invalid path: %s", fullPath);
332 
333             // The keyValuePairs list is used by the collectElementAttribute callback but we don't
334             // want/need to make a new callback each time, so just clear the underlying list.
335             keyValuePairs.clear();
336             processPathAttributes(
337                 elementName, attributes, dtdType, collectElementAttribute, valueAttributeCollector);
338 
339             // WARNING: We cannot just get the draft attribute value from the attributes map (as
340             // would be expected) because it throws an exception. This is because you can only
341             // "get()" values from the attributes map if they are potentially valid attributes for
342             // that element. Unfortunately this is due to a deliberate choice in the implementation
343             // of MapComparator, which explicitly throws an exception if an unknown attribute is
344             // encountered. Thus we have to check that "draft" is a possible attribute before
345             // asking for its value.
346             //
347             // TODO(dbeaumont): Fix this properly, ideally by fixing the comparator to not throw.
348             CldrDraftStatus draftStatus = dtd.getAttribute(elementName, "draft") != null
349                 ? CldrDraftStatus.forString(attributes.get("draft")) : null;
350 
351             if (!diverged && n < previousElements.size()) {
352                 CldrPath p = previousElements.get(n);
353                 if (p.matchesContent(elementName, sortIndex, keyValuePairs, draftStatus)) {
354                     // The previous path we processed was the same at least down to this depth, so
355                     // we can just reuse that element instead of creating a new one.
356                     //
357                     // In tests over the resolved paths for "en_GB" in DTD order, there are ~128k
358                     // path elements processed, with ~103k being reused here and ~25k being newly
359                     // allocated (a > 80% reuse rate). However this works best when the incoming
360                     // paths are sorted, since otherwise the "previous" path is random.
361                     //
362                     // For unsorted paths, the reuse rate is reduced to approximately 25%. This is
363                     // still saving ~32k allocations for the "en_GB" example, and it still saved
364                     // about 35% in terms of measured time to process the paths.
365                     path = p;
366                     continue;
367                 }
368             }
369             // This path has diverged from the previous path, so we must start making new elements.
370             path = new CldrPath(path, elementName, keyValuePairs, dtdType, draftStatus, sortIndex);
371             diverged = true;
372         }
373         return path;
374     }
375 
376     // Returns the element's sort index (or -1 if not present).
processPathAttributes( String elementName, Map<String, String> attributeMap, CldrDataType dtdType, Consumer<Entry<String, String>> collectElementAttribute, BiConsumer<AttributeKey, String> collectValueAttribute)377     static void processPathAttributes(
378         String elementName,
379         /* @Nullable */ Map<String, String> attributeMap,
380         CldrDataType dtdType,
381         Consumer<Entry<String, String>> collectElementAttribute,
382         BiConsumer<AttributeKey, String> collectValueAttribute) {
383 
384         // Why not just a Map? Maps don't efficiently handle "get the Nth element" which is
385         // something that's used a lot in the ICU conversion code. Distinguishing attributes in
386         // paths are used more as a "list of pairs" than a map with key/value lookup. Even
387         // extensions like "NavigableMap" don't give you what you want.
388         //
389         // This does mean that lookup-by-name is a linear search (unless you want to pay the cost
390         // of also having a map here) but the average number of attributes is very small (<3).
391         if (attributeMap != null && !attributeMap.isEmpty()) {
392             processAttributes(
393                 attributeMap.entrySet().stream(), elementName, collectValueAttribute, dtdType)
394                 .forEach(collectElementAttribute);
395         }
396     }
397 
processAttributes( Stream<Entry<String, String>> in, String elementName, BiConsumer<AttributeKey, String> collectValueAttribute, CldrDataType dtdType)398     static Stream<Entry<String, String>> processAttributes(
399         Stream<Entry<String, String>> in,
400         String elementName,
401         BiConsumer<AttributeKey, String> collectValueAttribute,
402         CldrDataType dtdType) {
403         Consumer<Entry<String, String>> collectValueAttributes = e -> {
404             if (dtdType.isValueAttribute(elementName, e.getKey())) {
405                 collectValueAttribute.accept(AttributeKey.keyOf(elementName, e.getKey()), e.getValue());
406             }
407         };
408         return in
409             // Special case of a synthetic distinguishing attribute that's _not_ in the DTD, and
410             // should be ignored.
411             .filter(e -> !e.getKey().equals(HIDDEN_SORT_INDEX_ATTRIBUTE))
412             .peek(collectValueAttributes)
413             .filter(e -> dtdType.isDistinguishingAttribute(elementName, e.getKey()));
414     }
415 
CldrPaths()416     private CldrPaths() {}
417 }
418