1 // © 2019 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 package org.unicode.cldr.api; 4 5 import static com.google.common.base.Preconditions.checkArgument; 6 import static com.google.common.base.Preconditions.checkNotNull; 7 import static com.google.common.base.Preconditions.checkPositionIndex; 8 import static com.google.common.base.Preconditions.checkState; 9 import static com.google.common.collect.ImmutableMap.toImmutableMap; 10 import static org.unicode.cldr.api.AttributeKey.keyOf; 11 12 import java.util.ArrayList; 13 import java.util.LinkedHashMap; 14 import java.util.List; 15 import java.util.Map; 16 import java.util.Optional; 17 import java.util.function.Predicate; 18 import java.util.regex.Matcher; 19 import java.util.regex.Pattern; 20 21 import com.google.common.collect.ImmutableList; 22 import com.google.common.collect.ImmutableMap; 23 24 /** 25 * An immutable matcher for {@link CldrPath} instances. 26 * 27 * <p>A path matcher pattern looks like: 28 * <ul> 29 * <li>{@code //ldml/path/prefix/element[@attribute="value"]} 30 * <li>{@code //ldml/path/prefix/element[@attribute=*]} 31 * <li>{@code //ldml/path/prefix/*[@attribute="value"]} 32 * <li>{@code //ldml/path/prefix/element[@foo=*]/*[@bar="value"]} 33 * </ul> 34 * where element names and attribute values can be wildcards. No capturing of wildcard values is 35 * done, and currently wildcards are limited to the entire field (element name, or attribute value). 36 * 37 * <h2>Matching an exact path</h2> 38 * <pre>{@code 39 * private static final PathMatcher ALIAS = 40 * PathMatcher.of("//supplementalData/likelySubtags/likelySubtag[@from=*][@to=*]"); 41 * 42 * ... 43 * 44 * if (ALIAS.matches(path)) { 45 * // Path represents a subtag alias. 46 * } 47 * }</pre> 48 * 49 * <h2>Matching path prefixes and hierarchical matching</h2> 50 * <pre>{@code 51 * private static final PathMatcher KEY = PathMatcher.of("//ldmlBCP47/keyword/key[@name=\"ca\"]"); 52 * private static final PathMatcher TYPE = key.withSuffix("type[@name=*]"); 53 * 54 * ... 55 * 56 * if (KEY.matchesPrefixOf(path)) { 57 * // Once the KEY prefix is matched, we can just "locally" match the suffix. 58 * if (TYPE.locallyMatches(path)) { 59 * // Path is a key-type mapping. 60 * } 61 * } 62 * }</pre> 63 */ 64 public final class PathMatcher { 65 private static final Pattern ROOT_PATH_SPEC = Pattern.compile("//(\\w+)(/|$)"); 66 67 /** 68 * Parses the full or partial (prefix) path pattern into a matcher. Path patterns given to this 69 * method must start with {@code "//xxx/..."} where {@code xxx} is a known CLDR XML path prefix 70 * (such as {@code "ldml"}). 71 */ of(String pattern)72 public static PathMatcher of(String pattern) { 73 Matcher m = ROOT_PATH_SPEC.matcher(pattern); 74 checkArgument(m.lookingAt(), "invalid path pattern: %s", pattern); 75 // This throws IllegalArgumentException if the type isn't valid. 76 CldrDataType.forXmlName(m.group(1)); 77 return new PathMatcher(null, parse(pattern.substring(2))); 78 } 79 80 private final Optional<PathMatcher> parent; 81 private final ImmutableList<Predicate<CldrPath>> elementMatchers; 82 private final int totalElementCount; 83 PathMatcher(PathMatcher parent, List<Predicate<CldrPath>> elementMatchers)84 private PathMatcher(PathMatcher parent, List<Predicate<CldrPath>> elementMatchers) { 85 this.parent = Optional.ofNullable(parent); 86 this.elementMatchers = ImmutableList.copyOf(elementMatchers); 87 this.totalElementCount = 88 elementMatchers.size() + (parent != null ? parent.totalElementCount : 0); 89 } 90 91 /** 92 * Extends this matcher by the given path pattern suffix. 93 * 94 * <p>For example if we define: 95 * <pre>{@code 96 * private static final PathMatcher COLLATIONS = PathMatcher.of("//ldml/collations"); 97 * private static final PathMatcher RULE = COLLATIONS.extendBy("collation/cr"); 98 * }</pre> 99 * Then {@code RULE} would match paths like {@code "//ldml/collations/collation/cr"}. 100 */ withSuffix(String pattern)101 public PathMatcher withSuffix(String pattern) { 102 return new PathMatcher(this, parse(pattern)); 103 } 104 105 /** 106 * Returns whether this matcher fully matches the given path, including taking into account 107 * any parent matchers from which this matcher might have been extended. 108 */ matches(CldrPath path)109 public boolean matches(CldrPath path) { 110 if (!locallyMatches(path)) { 111 return false; 112 } 113 if (!parent.isPresent()) { 114 // No parent means this is a "root" matcher, so we've finished matching everything. 115 return true; 116 } 117 while (path.getLength() > totalElementCount - elementMatchers.size()) { 118 path = path.getParent(); 119 } 120 return parent.get().locallyMatches(path); 121 } 122 123 /** 124 * Returns whether this matcher matches a proper prefix of the given path, including taking 125 * into account any parent matchers from which this matcher might have been extended. Note that 126 * the matcher for {@code "//ldml/foo/bar"} is not considered a prefix match for the path 127 * {@code //ldml/foo/barbaz}, since partial matching of an element name is not allowed. 128 */ matchesPrefixOf(CldrPath path)129 public boolean matchesPrefixOf(CldrPath path) { 130 if (path.getLength() < totalElementCount) { 131 return false; 132 } 133 while (path.getLength() > totalElementCount) { 134 path = path.getParent(); 135 } 136 return matches(path); 137 } 138 139 /** 140 * Returns whether this matcher matches the given path only in relation to the pattern 141 * used to generate this matcher. This method is useful for nested visitation via {@link 142 * org.unicode.cldr.api.CldrData.PrefixVisitor PrefixVisitor} when path prefixes are visited 143 * before values. 144 * 145 * <p>This method assumes that any parent matchers from which this matcher was extended already 146 * match the path. The possible length of the parent matchers is taken into account however to 147 * ensure that the sub-sequence match for this matcher starts at the correct point in the path. 148 */ locallyMatches(CldrPath path)149 public boolean locallyMatches(CldrPath path) { 150 return (path.getLength() == totalElementCount) 151 && matchRegion(path, totalElementCount - elementMatchers.size()); 152 } 153 matchRegion(CldrPath path, int offset)154 private boolean matchRegion(CldrPath path, int offset) { 155 // offset is the path element corresponding the the "top most" element matcher, it 156 // must be in the range 0 ... (path.length() - elementMatchers.size()). 157 checkPositionIndex(offset, path.getLength() - elementMatchers.size()); 158 // First jump over the path parents until we find the last matcher. 159 int matchPathLength = offset + elementMatchers.size(); 160 while (path.getLength() > matchPathLength) { 161 path = path.getParent(); 162 } 163 return matchForward(path, elementMatchers.size() - 1); 164 } 165 matchForward(CldrPath path, int matcherIndex)166 private boolean matchForward(CldrPath path, int matcherIndex) { 167 if (matcherIndex < 0) { 168 return true; 169 } 170 return matchForward(path.getParent(), matcherIndex - 1) 171 && elementMatchers.get(matcherIndex).test(path); 172 } 173 174 // --- Parsing of matcher path patterns ---- 175 176 // Make a new, non-interned, unique instance here which we can test by reference to 177 // determine if the argument is to be captured (needed as ImmutableMap prohibits null). 178 // DO NOT change this code to assign "*" as the value directly, it MUST be a new instance. 179 @SuppressWarnings("StringOperationCanBeSimplified") 180 private static final String WILDCARD = new String("*"); 181 182 private static final Pattern ELEMENT_START_REGEX = 183 Pattern.compile("(\\*|[-:\\w]+)(?:/|\\[|$)"); 184 private static final Pattern ATTRIBUTE_REGEX = 185 Pattern.compile("\\[@([-:\\w]+)=(?:\\*|\"([^\"]*)\")]"); 186 187 // element := foo, foo[@bar="baz"], foo[@bar=*] 188 // pathspec := element{/element}* parse(String pattern)189 private static List<Predicate<CldrPath>> parse(String pattern) { 190 List<Predicate<CldrPath>> specs = new ArrayList<>(); 191 int pos = 0; 192 do { 193 pos = parse(pattern, pos, specs); 194 } while (pos >= 0); 195 return specs; 196 } 197 198 // Return next start index or -1. parse(String pattern, int pos, List<Predicate<CldrPath>> specs)199 private static int parse(String pattern, int pos, List<Predicate<CldrPath>> specs) { 200 Matcher m = ELEMENT_START_REGEX.matcher(pattern).region(pos, pattern.length()); 201 checkArgument(m.lookingAt(), "invalid path pattern (index=%s): %s", pos, pattern); 202 String name = m.group(1); 203 Map<String, String> attributes = ImmutableMap.of(); 204 pos = m.end(1); 205 if (pos < pattern.length() && pattern.charAt(pos) == '[') { 206 // We have attributes to add. 207 attributes = new LinkedHashMap<>(); 208 do { 209 m = ATTRIBUTE_REGEX.matcher(pattern).region(pos, pattern.length()); 210 checkArgument(m.lookingAt(), 211 "invalid path pattern (index=%s): %s", pos, pattern); 212 // Null if we matched the '*' wildcard. 213 String value = m.group(2); 214 attributes.put(m.group(1), value != null ? value : WILDCARD); 215 pos = m.end(); 216 } while (pos < pattern.length() && pattern.charAt(pos) == '['); 217 } 218 // Wildcard matching is less efficient because attribute keys cannot be made in advance, so 219 // since it's also very rare, we special case it. 220 Predicate<CldrPath> matcher = name.equals(WILDCARD) 221 ? new WildcardElementMatcher(attributes)::match 222 : new ElementMatcher(name, attributes)::match; 223 specs.add(matcher); 224 if (pos == pattern.length()) { 225 return -1; 226 } 227 checkState(pattern.charAt(pos) == '/', 228 "invalid path pattern (index=%s): %s", pos, pattern); 229 return pos + 1; 230 } 231 232 // Matcher for path elements like "foo[@bar=*]" where the name is known in advance. 233 private static final class ElementMatcher { 234 private final String name; 235 private final ImmutableMap<AttributeKey, String> attributes; 236 ElementMatcher(String name, Map<String, String> attributes)237 private ElementMatcher(String name, Map<String, String> attributes) { 238 this.name = checkNotNull(name); 239 this.attributes = attributes.entrySet().stream() 240 .collect(toImmutableMap(e -> keyOf(name, e.getKey()), Map.Entry::getValue)); 241 } 242 243 @SuppressWarnings("StringEquality") match(CldrPath path)244 boolean match(CldrPath path) { 245 if (!path.getName().equals(name)) { 246 return false; 247 } 248 for (Map.Entry<AttributeKey, String> e : attributes.entrySet()) { 249 String actual = path.get(e.getKey()); 250 if (actual == null) { 251 return false; 252 } 253 String expected = e.getValue(); 254 // DO NOT change this to use expected.equals(WILDCARD). 255 if (expected != WILDCARD && !expected.equals(actual)) { 256 return false; 257 } 258 } 259 return true; 260 } 261 } 262 263 // Matcher for path elements like "*[@bar=*]", where the name isn't known until match time. 264 private static final class WildcardElementMatcher { 265 private final ImmutableMap<String, String> attributes; 266 WildcardElementMatcher(Map<String, String> attributes)267 private WildcardElementMatcher(Map<String, String> attributes) { 268 this.attributes = ImmutableMap.copyOf(attributes); 269 } 270 271 @SuppressWarnings("StringEquality") match(CldrPath path)272 private boolean match(CldrPath path) { 273 // The wildcard matcher never fails due to the element name but must create new key 274 // instances every time matching occurs (because the key name is dynamic). Since this 275 // is rare, it's worth making into a separate case. 276 for (Map.Entry<String, String> attribute : attributes.entrySet()) { 277 String actual = path.get(keyOf(path.getName(), attribute.getKey())); 278 if (actual == null) { 279 return false; 280 } 281 String expected = attribute.getValue(); 282 // DO NOT change this to use expected.equals(WILDCARD). 283 if (expected != WILDCARD && !expected.equals(actual)) { 284 return false; 285 } 286 } 287 return true; 288 } 289 } 290 } 291