// © 2019 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
package org.unicode.icu.tool.cldrtoicu.regex;

import static com.google.common.base.CharMatcher.whitespace;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkElementIndex;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static java.util.Comparator.comparing;
import static java.util.Comparator.nullsLast;
import static org.unicode.cldr.api.CldrPath.parseDistinguishingPath;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;

import org.unicode.cldr.api.CldrPath;
import org.unicode.cldr.api.CldrValue;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
import org.unicode.icu.tool.cldrtoicu.RbPath;

import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;

/**
 * A specification for building a result from the arguments in a matched xpath. Results always
 * hold a reference to their originating specification to allow them to be ordered in the same
 * order as the corresponding specifications in the configuration file.
 */
final class ResultSpec {
    // Subtle ordering for results to ensure "config file order" for things in the same
    // resource bundle while being "friendly" towards a global ordering. This is NOT consistent
    // with equals if duplicate results exist.
    //
    // This is ESSENTIAL for correct grouping and ordering within resource bundles.
    //
    // In normal use this is expected only to be used to reorder results within a resource
    // bundle (i.e. those sharing the same resource bundle path "key"). Resource bundles
    // themselves can just be managed in "visitation order" or similar.
    //
    // Ordering priority is:
    // 1: Result key (resource bundle): Groups results by resource bundle.
    // 2: Result specification line number: Orders resource bundle contents by "file order".
    // 3: Result distinguishing xpath: Tie breaking if duplicates are not yet removed.
    //    Results without a path (i.e. fallback results) sort after those with a path; the
    //    null handling is applied to the extracted path (not to the result itself) so that
    //    comparing two path-less results is safe.
    //
    // Note that this currently uses the String representation of the resource bundle path
    // (key) as the primary order to match legacy behaviour. However it would be better to use
    // the natural lexicographical RbPath order (the difference relates to having '/' as the
    // separator in the string representation of the path). The string form of a path is a bad
    // choice because some paths can contain a literal '/', which makes ordering problematic in
    // rare cases. However changing this will have the effect of reordering path elements,
    // which while it should be safe, must be done with caution.
    // TODO: Fix this to use RbPath ordering and NOT the String representation
    private static final Comparator<AbstractResult> RESULT_ORDERING =
        Comparator.<AbstractResult, String>comparing(r -> r.getKey().toString())
            .thenComparing(r -> r.getSpec().lineNumber)
            .thenComparing(
                comparing(
                    r -> r.getPath().orElse(null),
                    nullsLast(Comparator.<CldrPath>naturalOrder())));

    // Splitter for any values (either in CLDR data or results specifications). The only time
    // values are split differently is when quoting exists in the "values" instruction.
    private static final Splitter VALUE_SPLITTER = Splitter.on(whitespace()).omitEmptyStrings();

    // Matcher for "&foo_bar(a,b,c)" which captures function name and complete argument list.
    private static final Pattern FUNCTION = Pattern.compile("&(\\w++)\\(([^)]++)\\)");

    // Resource bundle path specification with placeholders (e.g. "/foo/$1/bar") exactly as it
    // appears in the configuration file.
    private final String rbPathSpec;

    // Declared instructions with which to generate result values (see Instruction).
    private final ImmutableMap<Instruction, VarString> instructions;

    // The index of the xpath argument whose value should be split to create multiple results.
    // This mechanism is used when an xpath attribute is a space separated list of values and
    // one result should be created for each value (e.g. [@territories="AA BB CC"] but you want
    // a resource bundle for each region code (e.g. "foo/XX/bar", "foo/YY/bar", "foo/ZZ/bar").
    // At most one argument is ever split (corresponding to the first unquoted placeholder in
    // the resource bundle path specification).
    private final int splitArgIndex;

    // The line number of the result specification in the file which defines the ordering of
    // results within a resource bundle. This needn't be a line number, but must be unique for
    // each specification.
    private final int lineNumber;

    // The named functions available to the parser. Ideally the rules and result specifications
    // would be an inner class of some kind of context/environment and just share this.
    private final ImmutableMap<String, NamedFunction> icuFunctions;

    // The map of dynamic variables (looked up from CldrPaths when a rule is resolved).
    private final Function<Character, CldrPath> dynamicVarFn;

    /**
     * Creates a result specification.
     *
     * @param rbPathSpec the resource bundle path specification, possibly containing "$N"
     *     placeholders (must not be null).
     * @param instructions the declared instructions for this specification (copied).
     * @param lineNumber the ordering key for results generated by this specification.
     * @param icuFunctions the named functions which may be invoked from the "values"
     *     instruction (copied).
     * @param dynamicVarFn maps dynamic variable characters to CLDR paths (must not be null).
     */
    ResultSpec(
        String rbPathSpec,
        Map<Instruction, VarString> instructions,
        int lineNumber,
        Map<String, NamedFunction> icuFunctions,
        Function<Character, CldrPath> dynamicVarFn) {
        this.rbPathSpec = checkNotNull(rbPathSpec);
        this.instructions = ImmutableMap.copyOf(instructions);
        this.splitArgIndex = getSplitArgIndex(rbPathSpec);
        this.lineNumber = lineNumber;
        this.icuFunctions = ImmutableMap.copyOf(icuFunctions);
        this.dynamicVarFn = checkNotNull(dynamicVarFn);
    }

    /**
     * Transforms a path/value into a sequence of results. The given matcher has successfully
     * matched the path and contains the captured arguments corresponding to $1..$N in the
     * various result specification strings.
     *
     * @param value the CLDR value whose path was matched.
     * @param m the matcher holding the captured arguments (every group must have matched).
     * @param varLookupFn resolves dynamic variable paths to values.
     * @return one result, or one result per split value of the splittable argument.
     */
    Stream<Result> transform(
        CldrValue value, Matcher m, DynamicVars varLookupFn) {
        // Discard group(0) since that's always the full xpath that was matched, and we don't
        // need that any more (so "$N" is args.get(N - 1)).
        List<String> args = new ArrayList<>();
        for (int i = 1; i <= m.groupCount(); i++) {
            // Null arguments are rejected eagerly so that later substitution never has to
            // deal with a missing value.
            args.add(checkNotNull(m.group(i),
                "captured regex arguments must always be present\n"
                    + "(use an non-capturing groups for optional arguments): %s", m.pattern()));
        }

        // The first unquoted argument in any resource bundle path declaration, is defined as
        // being "splittable". Typically this happens if the value of the captured xpath
        // argument is expected to be a list of items.
        //
        // In this case, we generate one result for each individual argument, replacing the
        // appropriate captured list with each split value in turn. Thus with original
        // arguments:
        //   ["foo", "bar baz", "quux"]
        // where splitArgIndex == 1, we get two results using the argument lists:
        //   ["foo", "bar", "quux"]
        //   ["foo", "baz", "quux"]
        //
        // Note also that since the splittability of the arguments is technically defined
        // by the resource bundle path specification (not the xpath regular expression) it
        // could differ per ResultSpec instance (but currently never does).
        if (splitArgIndex != -1) {
            List<String> splitArgs = VALUE_SPLITTER.splitToList(args.get(splitArgIndex));
            // Only bother if there was more than one argument there anyway.
            if (splitArgs.size() > 1) {
                // The shared argument list is updated in place for each split value; each
                // result is fully built from it before the next value is substituted.
                return splitArgs.stream().map(a -> {
                    args.set(splitArgIndex, a);
                    return matchedResult(value, args, varLookupFn);
                });
            }
        }
        // No splittable argument, or a splittable argument with only one value.
        return Stream.of(matchedResult(value, args, varLookupFn));
    }

    // Simple helper to make results.
    private Result matchedResult(
        CldrValue value, List<String> args, DynamicVars varLookupFn) {
        return new MatchedResult(
            getRbPath(args),
            getValues(value.getValue(), args),
            getResultPath(value.getPath(), args, varLookupFn));
    }

    // Resource bundle paths are a bit special (unsurprisingly). The captured arguments can
    // contain '/' and will extend the path structure. Thus "foo/$1/bar" might end up as
    // "foo/x/y/bar" after argument substitution.
    //
    // However (a hack for timezone "metazone" paths) if the argument placeholder is quoted
    // (e.g. "foo/"$1"/bar") then '/' in arguments is replaced by ':' and quotes are retained
    // (e.g. "foo/"x:y"/bar).
    // TODO: Replace hard coded hack here with an explicit function in the config file.
    private RbPath getRbPath(List<String> args) {
        // Without more careful parsing, it's hard to figure out if quotes in a resource bundle
        // path specification are around a placeholder or not. Since quotes are only used in a
        // small number of cases currently, and only for this purpose, we just assume that any
        // quotes in the path specification should trigger this behaviour.
        if (rbPathSpec.contains("\"")) {
            // Use a lazy transforming list to avoid char replacement in arguments that don't
            // appear in the resource bundle path.
            args = Lists.transform(args, s -> s.replace('/', ':'));
        }
        String path = substituteArgs(rbPathSpec, args);
        return RbPath.parse(path);
    }

    // Create an array of output values according to the CLDR value (if present) and the
    // "values" instruction in the result specification (if present). Any functions present in
    // the "values" instruction are invoked here.
    private ImmutableList<String> getValues(String value, List<String> args) {
        VarString valuesSpec = instructions.get(Instruction.VALUES);
        if (valuesSpec == null) {
            // No "values" instruction, so just use the _unsplit_ CLDR value. To split a CLDR
            // value use "values={value}" in the result specification.
            return ImmutableList.of(value);
        }
        // The "value" instruction is not expected to have any dynamic %N variables in it,
        // since those only represent CLDR path mappings, which should not be directly present
        // in the ICU data. Hence the valueSpec should have been fully resolved by the static
        // variables applied earlier and we should just need to resolve() it into a String.
        String resolved = valuesSpec.get();

        // First substitute the $N arguments in since they need to be passed to the
        // functions.
        //
        // WARNING: This doesn't strictly work, since an argument or function result could
        // (in theory) contain the string "{value}" which would then be substituted in an
        // unexpected way. The better way to do this is with a single pass which handles
        // arguments, function calling and the special "{value}" token together. This comes
        // down to the fact that the mapping file syntax doesn't have a well defined concept
        // of escaping or invocation order.
        // TODO: Fix this, possibly by rewriting the whole transformer "language" to be consistent.
        resolved = substituteArgs(resolved, args);

        Matcher m = FUNCTION.matcher(resolved);
        if (m.find()) {
            StringBuilder buffer = new StringBuilder();
            int index = 0;
            do {
                // Append up to the start of the function call.
                buffer.append(resolved, index, m.start());

                // Replace '{value}' here so functions can be called with the CLDR value as well
                // as captured path arguments. We also have to replace it below, which is all a
                // bit dodgy if a function ever returned '{value}'.
                NamedFunction fn = icuFunctions.get(m.group(1));
                checkArgument(fn != null, "no such function: %s", m.group(1));
                buffer.append(fn.call(m.group(2).replace("{value}", value)));
                index = m.end();
            } while (m.find());
            resolved = buffer.append(resolved.substring(index)).toString();
        }
        // Having done function invocation, we handle the special "{value}" token and split
        // the value (taking quoting into account).
        return splitValues(resolved.replace("{value}", value));
    }

    // IMPORTANT: The path of a result is either:
    // * The original distinguishing path
    // * The specified "base_xpath" (which must also be a distinguishing xpath).
    // and this is used as part of the equality semantics (which are very subtle).
    //
    // The existence of "base_xpath" is a hack to get around the fact the xpaths can only be
    // matched in full, rather than by a prefix. For some cases this means that the "same"
    // result will be created many times by potentially different distinguishing xpaths,
    // perhaps even via different result specifications. "base_xpath" exists as a hack to give
    // these duplicate results the same "fake" xpath, so deduplication can occur.
    private CldrPath getResultPath(CldrPath path, List<String> args, DynamicVars varLookupFn) {
        VarString basePath = instructions.get(Instruction.BASE_XPATH);
        if (basePath == null) {
            return path;
        }
        String resolvedBasePath = basePath.apply(dynamicVarFn.andThen(varLookupFn)).get();
        return parseDistinguishingPath(substituteArgs(resolvedBasePath, args));
    }

    /**
     * Returns a fallback function if this specification has the "fallback=" instruction.
     * The function takes a resolved resource bundle path and returns the possible fallback
     * values for it. Note that currently fallback values do not support either quoting or
     * grouping (but they easily could).
     */
    Optional<BiFunction<RbPath, DynamicVars, Optional<Result>>> getFallbackFunction() {
        VarString fallbackSpec = instructions.get(Instruction.FALLBACK);
        if (fallbackSpec == null) {
            return Optional.empty();
        }
        // This is the only place where any hacking of regular expressions occurs. The fallback
        // function must only return a value if the given resolved resource bundle path could
        // have been a match for the path specification.
        //
        // In order to avoid ambiguity for paths such as "foo/$1/$2/bar" and "foo/$1/bar" which
        // should not both be matched, we explicitly disallow '/' in argument values. In theory
        // this is problematic, since '/' should be an allowed character, but the issues caused
        // by ambiguous matching are worse.
        // TODO: Fix/replace all of this fallback mess with something cleaner.
        Pattern rbPathMatcher = getRbPathMatcher(rbPathSpec);

        // Another, frankly terrifying, bit of hackery to support fallback specifications with
        // $N argument substitution (this currently only happens once, but must be supported).
        // Just another reason to want to replace the current fallback mechanism.
        fallbackSpec = maybeRewriteFallbackSpec(fallbackSpec);

        // Just copying here to make it effectively final.
        VarString finalFallbackSpec = fallbackSpec;
        return Optional.of(
            (p, varFn) -> getFallbackResult(p, varFn, rbPathMatcher, finalFallbackSpec));
    }

    // Returns the fallback result for the given resource bundle path, or empty if the path
    // could not have been produced by this specification's resource bundle path pattern.
    private Optional<Result> getFallbackResult(
        RbPath rbPath, DynamicVars varFn, Pattern rbPathMatcher, VarString fallbackSpec) {
        // Check if the given rbPath could be associated with this fallback (most are not).
        Matcher matcher = rbPathMatcher.matcher(rbPath.toString());
        if (!matcher.matches()) {
            return Optional.empty();
        }
        // Expect that once any dynamic variables are provided to the fallback specification,
        // we can get the resolved fallback specification (potentially with $N placeholders to
        // be filled in from the resource bundle path).
        String specStr = fallbackSpec.apply(dynamicVarFn.andThen(varFn)).get();
        if (matcher.groupCount() > 0) {
            // Arguments are zero-indexed but capture groups start at 1.
            specStr = substituteArgs(specStr, n -> matcher.group(n + 1), matcher.groupCount());
        }

        // Split the fallback value _without_ considering quoting. This matches the original
        // behaviour but could cause all sorts of subtle issues if values contained quotes.
        // TODO: Rework transformation rules to make quoting behaviour deterministic.
        Iterable<String> values =
            VALUE_SPLITTER.splitToList(specStr).stream()
                // Fallback values that "look like" CLDR paths are auto-magically resolved.
                .map(v -> v.startsWith("//") ? varFn.apply(parseDistinguishingPath(v)) : v)
                .collect(toImmutableList());
        return Optional.of(new FallbackResult(rbPath, values));
    }

    // WARNING: Another very hacky behaviour (used exactly once) is that "$N" argument
    // substitutions are allowed in fallback values. This is highly problematic because
    // since the fallback value must be synthesized only from the resource bundle path,
    // there's no way for this substitution to handle:
    // 1: multi-valued list arguments
    // 2: arguments that didn't appear in the resource bundle path
    // 3: dynamic path variables (e.g. %D=//some/path)
    //
    // An example would be something like a resource bundle specification of:
    //   /Baz/$2/$1
    // and a fallback value of:
    //   Foo$1/Bar$2
    //
    // Here the order of substitution is not maintained and the original path specification
    // has values that are not naturally ordered (or possibly even duplicated). The pattern
    // we calculate from the resource bundle path specification will match/capture groups in
    // "natural order" (i.e. "/Baz/(...)/(...)") so we have to rewrite the order of the
    // placeholders in the fallback specification to match (e.g. "Foo$2/Bar$1").
    // TODO: Figure out a way to remove all of this extreme complexity.
    private VarString maybeRewriteFallbackSpec(
        VarString fallbackSpec) {
        Optional<String> fallback = fallbackSpec.resolve();
        // If the fallback string is not present, it's because the VarString still has
        // unresolved "dynamic" variables for late binding. This is okay, but should not
        // be mixed with argument substitution.
        if (!fallback.isPresent() || !fallback.get().contains("$")) {
            return fallbackSpec;
        }
        // After the quick rejection check for '$', do a proper search for $N variables (since
        // '$' is permitted as a literal if not followed by a digit).
        Matcher fallbackMatcher = ARG_PLACEHOLDER.matcher(fallback.get());
        if (!fallbackMatcher.find()) {
            return fallbackSpec;
        }

        // Fallback spec has $N in it, triggering super hacky behaviour.
        Matcher pathMatcher = ARG_PLACEHOLDER.matcher(rbPathSpec);
        checkState(pathMatcher.find(),
            "$N arguments in fallback must be present in the resource bundle path: %s",
            rbPathSpec);
        // Explicit group characters ("1"..."9") in the order they appear in the
        // resource bundle path. There can be duplicates (e.g. "/Foo/$1/Bar$1").
        List<Character> groupIds = new ArrayList<>();
        do {
            groupIds.add(pathMatcher.group().charAt(1));
        } while (pathMatcher.find());

        // Special check to avoid a horrible bug if we ever had more than 9 distinct
        // placeholders (essentially impossible with current data). If it did happen,
        // the returned index below would be >= 9 and we would get "$X", where 'X' was
        // not a numeric value.
        checkState(groupIds.size() < 10,
            "too many placeholders in resource bundle path: %s", rbPathSpec);

        // Now find each placeholder in the fallback specification string and map it to
        // the equivalent index for the path matcher we just created.
        StringBuilder rewrittenFallbackSpec = new StringBuilder(fallback.get());
        do {
            // Position of the digit which follows the '$'. Only single characters are
            // overwritten, so match offsets from the original string remain valid.
            int placeholderPos = fallbackMatcher.start() + 1;
            // The new ID is the index of the corresponding placeholder offset by '1'.
            char placeholderDigit = rewrittenFallbackSpec.charAt(placeholderPos);
            int newPlaceholderIndex = groupIds.indexOf(placeholderDigit);
            checkState(newPlaceholderIndex != -1,
                "fallback values may only contain arguments from the resource bundle path: %s",
                fallback.get());
            rewrittenFallbackSpec.setCharAt(placeholderPos, (char)('1' + newPlaceholderIndex));
        } while (fallbackMatcher.find());
        return VarString.of(rewrittenFallbackSpec.toString());
    }

    /** Base class of either a matched or a fallback result. */
    private abstract class AbstractResult extends Result {
        // Split and resolved values for this result (see also "isGrouped()").
        private final ImmutableList<String> values;

        // The "source" CLDR path of a matched result (omitted if this is a fallback result).
        // Note that this is the resolved "base_xpath" if it was specified in the instructions.
        private final Optional<CldrPath> basePath;

        // Calculated eagerly since we always expect results to need to be deduplicated.
        private final int hashCode;

        AbstractResult(RbPath key, Iterable<String> values, Optional<CldrPath> path) {
            super(key);
            this.values = ImmutableList.copyOf(values);
            this.basePath = checkNotNull(path);
            // Same attributes in the same order as tested for in equals().
            this.hashCode = Objects.hash(getKey(), getPath(), isGrouped(), getValues());
        }

        // Returns the specification from which this result was obtained. This is essential for
        // correct ordering and determining fallback values, but is not directly used for
        // determining result equality (since duplicate results can be generated by different
        // specifications).
        final ResultSpec getSpec() {
            return ResultSpec.this;
        }

        // Returns the source path of this result (empty for fallback results).
        final Optional<CldrPath> getPath() {
            return basePath;
        }

        // Returns whether this result came from an explicit path match (not a fallback).
        final boolean wasMatched() {
            // We could also do this via a boolean field.
            return this instanceof MatchedResult;
        }

        @Override
        public final ImmutableList<String> getValues() {
            return values;
        }

        @Override
        public final int compareTo(Result other) {
            checkArgument(other instanceof AbstractResult,
                "unknown result type: %s", other.getClass());
            return RESULT_ORDERING.compare(this, (AbstractResult) other);
        }

        @Override
        public final int hashCode() {
            return hashCode;
        }

        // Equality semantics of results is ESSENTIAL for correct behaviour, especially the
        // deduplication of results. See also "getSpec()", "getPath()", and RESULT_ORDERING.
        @Override
        public final boolean equals(Object obj) {
            // Different subclasses are never equal, so test class directly (not instanceof).
            if (obj == null || !getClass().equals(obj.getClass())) {
                return false;
            }
            AbstractResult other = (AbstractResult) obj;
            // DO NOT test the result specifier here. Equal results can be generated from
            // different result specifications (if "base_xpath" was used).
            return getKey().equals(other.getKey())
                && getPath().equals(other.getPath())
                && isGrouped() == other.isGrouped()
                // Alternatively assert that values are equal if everything else is.
                && getValues().equals(other.getValues());
        }
    }

    // Result created for an explicit path match using captured arguments.
    private final class MatchedResult extends AbstractResult {
        MatchedResult(RbPath key, Iterable<String> values, CldrPath path) {
            super(key, values, Optional.of(path));
        }

        @Override
        public boolean isGrouped() {
            // We don't need to use the "group" value at all and it can be removed from the
            // configuration file at some point.
            return instructions.containsKey(Instruction.GROUP);
        }

        @Override
        public boolean isFallbackFor(Result r) {
            // Matched results are never a fallback for anything.
            return false;
        }
    }

    // Result created to hold possible fallback values for a specified resource bundle path.
    private final class FallbackResult extends AbstractResult {
        FallbackResult(RbPath rbPath, Iterable<String> values) {
            super(rbPath, values, Optional.empty());
        }

        // Delete this method and move the other one into AbstractResult if we decide to allow
        // grouping for fallback values (it's not clear if it's a good idea).
        @Override
        public boolean isGrouped() {
            return false;
        }

        @Override
        public boolean isFallbackFor(Result r) {
            // We are a fallback if we came from the same specification as a matched result.
            // To prevent duplication of fallback results, we also return true if we are
            // "equal()" to the given result (equivalent fallback results can come from
            // different input paths).
            checkArgument(r instanceof AbstractResult, "unsupported result type: %s", r);
            AbstractResult result = (AbstractResult) r;
            return result.wasMatched() ? getSpec().equals(result.getSpec()) : equals(result);
        }
    }

    // ==== Static helper functions ====

    // Matches any "$N" placeholder without capturing.
    private static final Pattern ARG_PLACEHOLDER = Pattern.compile("\\$[1-9]");

    // Turn "$N" into capturing groups.
    //
    // Note that this code currently assumes that each "$N" placeholder matches a single path
    // segment (i.e. the captured values cannot contain '/'). This is an artificial restriction
    // since resource bundle paths can have quoting in, so we could detect quoted placeholders
    // and allow any characters. However at the moment this isn't an issue, and none of the
    // "$N" placeholders in the paths expects to match anything with '/' in.
    //
    // TODO: Fix this to handle quoted placeholders (e.g. "$N" or <$N>) properly.
    private static Pattern getRbPathMatcher(String rbPathSpec) {
        // An RbPath instance's toString() does not have a leading '/' on it, so we'll have to
        // account for that here (or we could just remove the leading '/' from paths in the
        // config file...
        if (rbPathSpec.startsWith("/")) {
            rbPathSpec = rbPathSpec.substring(1);
        }
        // Protect potential regex meta-characters in the original resource bundle path. Using
        // '\Q' and '\E' to mark quotation boundaries is the safest way to do this, but that
        // means we also need to handle '\E' in the original string (incredibly unlikely but it
        // would be super hard to debug if it ever happened). A literal '\E' is handled by
        // closing the quotation, matching an escaped backslash followed by 'E' (the regex
        // "\\E") and then re-opening the quotation.
        // TODO: If resource paths cannot contain literal '\' or '$', add checks and simplify.
        String regex = "\\Q" + rbPathSpec.replace("\\E", "\\E\\\\E\\Q") + "\\E";

        // Remember that you could get "$1$2" here and the regex groups that replace them will
        // abut. Use reluctant matching (i.e. "+?") to avoid any backtracking in this case.
        // We assume that the substituted arguments contained at least one character, and so we
        // capture at least one character per group here.
        regex = ARG_PLACEHOLDER.matcher(regex).replaceAll("\\\\E([^/]+?)\\\\Q");
        return Pattern.compile(regex);
    }

    // Substitutes "$N" placeholders using the given (zero-indexed) argument list.
    private static String substituteArgs(String spec, List<String> args) {
        return substituteArgs(spec, args::get, args.size());
    }

    // Substitutes "$N" (N = 1...9) placeholders for values obtained from a zero-indexed
    // function (i.e. "$N" --> args(N - 1)).
    private static String substituteArgs(String spec, Function<Integer, String> args, int size) {
        return RegexTransformer.substitute(
            spec, '$', c -> args.apply(checkElementIndex(c - '1', size, "argument index")));
    }

    // Matches arguments with or without enclosing quotes.
    private static final Pattern ARGUMENT = Pattern.compile("[<\"]?\\$(\\d)[\">]?");

    // Logic mostly copied from original RegexManager class. Finds first unquoted $N (N=1..9)
    // and returns N-1 (or -1 if no match). We do not permit $0 to appear even though it is
    // captured by the regex because it's just the entire path.
    private static int getSplitArgIndex(String rbPath) {
        // Captures a $N placeholder, but might catch surrounding quoting as well.
        Matcher matcher = ARGUMENT.matcher(rbPath);
        while (matcher.find()) {
            char startChar = rbPath.charAt(matcher.start());
            char endChar = rbPath.charAt(matcher.end() - 1);
            // Splitting occurs for the first unquoted placeholder, so ignore <$1> and "$N".
            // Q: Why two different "quoting" schemes?
            // A: It's complex and relates to something called "hidden labels".
            boolean shouldSplit = !((startChar == '"' && endChar == '"') ||
                (startChar == '<' && endChar == '>'));
            if (shouldSplit) {
                // Allowed "$N" argument placeholders go from $1 to $9 ($0 is disallowed) and
                // arguments are zero-indexed, so we expect an index from 0 to 8.
                int groupNumber = Integer.parseInt(matcher.group(1));
                checkArgument(groupNumber >= 1 && groupNumber <= 9,
                    "invalid split argument: %s", groupNumber);
                return groupNumber - 1;
            }
        }
        return -1;
    }

    // Splits a possibly quoted string, where we need to handle \". This is a bit dubious
    // though as we don't detect or unescape \\. Thus it's impossible to represent a single '\'
    // at the end of a quoted string (e.g. "$1" where the expansion of $1 has a trailing '\').
    // It's also impossible to have a value that should be split but which contains '"'.
    //
    // This mimics the original RegexManager behaviour where spaces and quotes in
    // substituted values are _not_ escaped.
    private static ImmutableList<String> splitValues(String value) {
        int qstart = nextBareQuoteIndex(value, 0);
        if (qstart == -1) {
            return ImmutableList.copyOf(VALUE_SPLITTER.split(value));
        }
        ImmutableList.Builder<String> values = ImmutableList.builder();
        int rawStart = 0;
        do {
            // Unquoted text before the opening quote is split on whitespace as normal.
            values.addAll(VALUE_SPLITTER.split(value.substring(rawStart, qstart)));
            int qend = nextBareQuoteIndex(value, qstart + 1);
            checkArgument(qend != -1, "mismatched quotes in splittable value: %s", value);
            // Remember to unescape any '"' found in the quoted regions.
            values.add(value.substring(qstart + 1, qend).replace("\\\"", "\""));
            rawStart = qend + 1;
            qstart = nextBareQuoteIndex(value, qend + 1);
        } while (qstart != -1);
        values.addAll(VALUE_SPLITTER.split(value.substring(rawStart)));
        return values.build();
    }

    // Returns the index of the next '"' character (at or after index i) that's not preceded
    // by a '\', or -1 if there is no such character.
    private static int nextBareQuoteIndex(String s, int i) {
        i = s.indexOf('"', i);
        // If i == 0, then '"' is the first char and must be "bare". Otherwise skip over any
        // escaped quotes by searching for the next quote (not the next backslash).
        while (i > 0 && s.charAt(i - 1) == '\\') {
            i = s.indexOf('"', i + 1);
        }
        return i;
    }
}