• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2019 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 package org.unicode.icu.tool.cldrtoicu.regex;
4 
5 import static com.google.common.base.CharMatcher.whitespace;
6 import static com.google.common.base.Preconditions.checkArgument;
7 import static com.google.common.base.Preconditions.checkElementIndex;
8 import static com.google.common.base.Preconditions.checkNotNull;
9 import static com.google.common.base.Preconditions.checkState;
10 import static com.google.common.collect.ImmutableList.toImmutableList;
11 import static java.util.Comparator.comparing;
12 import static java.util.Comparator.nullsLast;
13 import static org.unicode.cldr.api.CldrPath.parseDistinguishingPath;
14 
15 import java.util.ArrayList;
16 import java.util.Comparator;
17 import java.util.List;
18 import java.util.Map;
19 import java.util.Objects;
20 import java.util.Optional;
21 import java.util.function.BiFunction;
22 import java.util.function.Function;
23 import java.util.regex.Matcher;
24 import java.util.regex.Pattern;
25 import java.util.stream.Stream;
26 
27 import org.unicode.cldr.api.CldrPath;
28 import org.unicode.cldr.api.CldrValue;
29 import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.DynamicVars;
30 import org.unicode.icu.tool.cldrtoicu.PathValueTransformer.Result;
31 import org.unicode.icu.tool.cldrtoicu.RbPath;
32 
33 import com.google.common.base.Splitter;
34 import com.google.common.collect.ImmutableList;
35 import com.google.common.collect.ImmutableMap;
36 import com.google.common.collect.Lists;
37 
38 /**
39  * A specification for building a result from the arguments in a matched xpath. Results always
40  * hold a reference to their originating specification to allow them to be ordered in the same
41  * order as the corresponding specifications in the configuration file.
42  */
43 final class ResultSpec {
44     // Subtle ordering for results to ensure "config file order" for things in the same
45     // resource bundle while being "friendly" towards a global ordering. This is NOT consistent
46     // with equals if duplicate results exist.
47     //
48     // This is ESSENTIAL for correct grouping and ordering within resource bundles.
49     //
50     // In normal use this is expected only to be used to reorder results within a resource
51     // bundle (i.e. those sharing the same resource bundle path "key"). Resource bundles
52     // themselves can just be managed in "visitation order" or similar.
53     //
54     // Ordering priority is:
55     // 1: Result key (resource bundle):     Groups results by resource bundle.
56     // 2: Result specification line number: Orders resource bundle contents by "file order".
57     // 3: Result distinguishing xpath:      Tie breaking if duplicates are not yet removed.
58     //
59     // Note that the currently uses the String representation of the resource bundle path (key)
60     // as the primary order to match legacy behaviour. However it would be better to use the
61     // natural lexicographical RbPath order (the difference relates to having '/' as the
62     // separator in the string representation of the path). The string form of a path is a bad
63     // choice because some paths can contain a literal '/', which makes ordering problematic in
64     // rare case. However changing this will have the effect of reodering path elements, which
65     // while it should be safe, must be done with caution.
66     // TODO: Fix this to use RbPath ordering and NOT the String representation
67     private static final Comparator<AbstractResult> RESULT_ORDERING =
68         Comparator.<AbstractResult, String>comparing(r -> r.getKey().toString())
69             .thenComparing(r -> r.getSpec().lineNumber)
70             .thenComparing(nullsLast(comparing(r -> r.getPath().orElse(null))));
71 
72     // Splitter for any values (either in CLDR data or results specifications). The only time
73     // values are split differently is when quoting exists in the "values" instruction.
74     private static final Splitter VALUE_SPLITTER = Splitter.on(whitespace()).omitEmptyStrings();
75 
76     // Matcher for "&foo_bar(a,b,c)" which captures function name and complete argument list.
77     private static final Pattern FUNCTION = Pattern.compile("&(\\w++)\\(([^)]++)\\)");
78 
79     // Resource bundle path specification with placeholders (e.g. "/foo/$1/bar") exactly as it
80     // appears in the configuration file.
81     private final String rbPathSpec;
82 
83     // Declared instructions with which to generate result values (see Instruction).
84     private final ImmutableMap<Instruction, VarString> instructions;
85 
86     // The index of the xpath argument whose value should be split to create multiple results.
87     // This mechanism is used when an xpath attribute is a space separated list of values and
88     // one result should be created for each value (e.g. [@territories="AA BB CC"] but you want
89     // a resource bundle for each region code (e.g. "foo/XX/bar", "foo/YY/bar", "foo/ZZ/bar").
90     // At most one argument is ever split (corresponding to the first unquoted placeholder in
91     // the resource bundle path specification).
92     private final int splitArgIndex;
93 
94     // The line number of the result specification in the file which defines the ordering of
95     // results within a resource bundle. This needn't be a line number, but must be unique for
96     // each specification.
97     private final int lineNumber;
98 
99     // The named functions available to the parser. Ideally the rules and result specifications
100     // would be an inner class of some kind of context/environment and just share this.
101     private final ImmutableMap<String, NamedFunction> icuFunctions;
102 
103     // The map of dynamic variables (looked up from CldrPaths when a rule is resolved).
104     private final Function<Character, CldrPath> dynamicVarFn;
105 
ResultSpec( String rbPathSpec, Map<Instruction, VarString> instructions, int lineNumber, Map<String, NamedFunction> icuFunctions, Function<Character, CldrPath> dynamicVarFn)106     ResultSpec(
107         String rbPathSpec,
108         Map<Instruction, VarString> instructions,
109         int lineNumber,
110         Map<String, NamedFunction> icuFunctions,
111         Function<Character, CldrPath> dynamicVarFn) {
112         this.rbPathSpec = checkNotNull(rbPathSpec);
113         this.instructions = ImmutableMap.copyOf(instructions);
114         this.splitArgIndex = getSplitArgIndex(rbPathSpec);
115         this.lineNumber = lineNumber;
116         this.icuFunctions = ImmutableMap.copyOf(icuFunctions);
117         this.dynamicVarFn = checkNotNull(dynamicVarFn);
118     }
119 
120     /**
121      * Transforms a path/value into a sequence of results. The given matcher has successfully
122      * matched the path and contains the captured arguments corresponding to $1..$N in the
123      * various result specification strings.
124      */
transform( CldrValue value, Matcher m, DynamicVars varLookupFn)125     Stream<Result> transform(
126         CldrValue value, Matcher m, DynamicVars varLookupFn) {
127         // Discard group(0) since that's always the full xpath that was matched, and we don't
128         // need that any more (so "$N" is args.get(N - 1)).
129         List<String> args = new ArrayList<>();
130         for (int i = 1; i <= m.groupCount(); i++) {
131             // Important since we turn this into an ImmutableList (which is null-hostile).
132             args.add(checkNotNull(m.group(i),
133                 "captured regex arguments must always be present\n"
134                     + "(use an non-capturing groups for optional arguments): %s", m.pattern()));
135         }
136 
137         // The first unquoted argument in any resource bundle path declaration, is defined as
138         // being "splittable". Typically this happens if the value of the captured xpath
139         // argument is expected to be a list of items.
140         //
141         // In this case, we generate one result for each individual argument, replacing the
142         // appropriate captured list with each split value in turn. Thus with original
143         // arguments:
144         //   ["foo", "bar baz", "quux"]
145         // where splitArgIndex == 1, we get two results using the argument lists:
146         //   ["foo", "bar", "quux"]
147         //   ["foo", "baz", "quux"]
148         //
149         // Note also that since the splittability of the arguments is technically defined
150         // by the resource bundle path specification (not the xpath regular expression) it
151         // could differ per ResultSpec instance (but currently never does).
152         if (splitArgIndex != -1) {
153             List<String> splitArgs = VALUE_SPLITTER.splitToList(args.get(splitArgIndex));
154             // Only bother if there was more than one argument there anyway.
155             if (splitArgs.size() > 1) {
156                 return splitArgs.stream().map(a -> {
157                     args.set(splitArgIndex, a);
158                     return matchedResult(value, args, varLookupFn);
159                 });
160             }
161         }
162         // No splittable argument, or a splittable argument with only one value.
163         return Stream.of(matchedResult(value, args, varLookupFn));
164     }
165 
166     // Simple helper to make results.
matchedResult( CldrValue value, List<String> args, DynamicVars varLookupFn)167     private Result matchedResult(
168         CldrValue value, List<String> args, DynamicVars varLookupFn) {
169         return new MatchedResult(
170             getRbPath(args),
171             getValues(value.getValue(), args),
172             getResultPath(value.getPath(), args, varLookupFn));
173     }
174 
175     // Resource bundle paths are a bit special (unsurprisingly). The captured arguments can
176     // contain '/' and will extend the path structure. Thus "foo/$1/bar" might end up as
177     // "foo/x/y/bar" after argument substitution.
178     //
179     // However (a hack for timezone "metazone" paths) if the argument placeholder is quoted
180     // (e.g. "foo/"$1"/bar") then '/' in arguments is replaced by ':' and quotes are retained
181     // (e.g. "foo/"x:y"/bar).
182     // TODO: Replace hard coded hack here with an explicit function in the config file.
getRbPath(List<String> args)183     private RbPath getRbPath(List<String> args) {
184         // Without more careful parsing, it's hard to figure out it quotes in a resource bundle
185         // path specification are around a placeholder or not. Since quotes are only used in a
186         // small number of cases currently, and only for this purpose, we just assume that any
187         // quotes in the path specification should trigger this behaviour.
188         if (rbPathSpec.contains("\"")) {
189             // Use a lazy transforming list to avoid char replacement in arguments that don't
190             // appear in the resource bundle path.
191             args = Lists.transform(args, s -> s.replace('/', ':'));
192         }
193         String path = substituteArgs(rbPathSpec, args);
194         return RbPath.parse(path);
195     }
196 
197     // Create an array of output values according to the CLDR value (if present) and the
198     // "values" instruction in the result specification (if present). Any functions present in
199     // the "values" instruction are invoked here.
getValues(String value, List<String> args)200     private ImmutableList<String> getValues(String value, List<String> args) {
201         VarString valuesSpec = instructions.get(Instruction.VALUES);
202         if (valuesSpec == null) {
203             // No "values" instruction, so just use the _unsplit_ CLDR value. To split a CLDR
204             // value use "values={value}" in the result specification.
205             return ImmutableList.of(value);
206         }
207         // The "value" instruction is not expected to have any dynamic %N variables in it,
208         // since those only represent CLDR path mappings, which should not be directly present
209         // in the ICU data. Hence the valueSpec should have been fully resolved by the static
210         // variables applied earlier and we should just need to resolve() it into a String.
211         String resolved = valuesSpec.get();
212 
213         // First substitute the $N arguments in since they need to be passed to the
214         // functions.
215         //
216         // WARNING: This doesn't strictly work, since an argument or function result could
217         // (in theory) contain the string "{value}" which would then be substituted in an
218         // unexpected way. The better way to do this is with a single pass which handles
219         // arguments, function calling and the special "{value}" token together. This comes
220         // down to the fact that the mapping file syntax doesn't have a well defined concept
221         // of escaping or invocation order.
222         // TODO: Fix this, possibly by rewriting the whole transformer "language" to be consistent.
223         resolved = substituteArgs(resolved, args);
224 
225         Matcher m = FUNCTION.matcher(resolved);
226         if (m.find()) {
227             StringBuilder buffer = new StringBuilder();
228             int index = 0;
229             do {
230                 // Append up to the start of the function call.
231                 buffer.append(resolved, index, m.start());
232 
233                 // Replace '{value}' here so functions can be called with the CLDR value as well
234                 // as captured path arguments. We also have to replace it below, which is all a bit
235                 // dodgy if a function every returned '{value}'.
236                 NamedFunction fn = icuFunctions.get(m.group(1));
237                 checkArgument(fn != null, "no such function: %s", m.group(1));
238                 buffer.append(fn.call(m.group(2).replace("{value}", value)));
239                 index = m.end();
240             } while (m.find());
241             resolved = buffer.append(resolved.substring(index)).toString();
242         }
243         // Having done function invocation, we handle the special "{value}" token and split
244         // the value (taking quoting into account).
245         return splitValues(resolved.replace("{value}", value));
246     }
247 
248     // IMPORTANT: The path of a result is either:
249     // * The original distinguishing path
250     // * The specified "base_xpath" (which must also be a distinguishing xpath).
251     // and this is used as part of the equality semantics (which are very subtle).
252     //
253     // The existence of "base_xpath" is a hack to get around the fact the xpaths can only be
254     // matched in full, rather than by a prefix. For some cases this means that the "same"
255     // result will be created many times by potentially different distinguishing xpaths,
256     // perhaps even via different result specifications. "base_xpath" exists as a hack to give
257     // these duplicate results the same "fake" xpath, so deduplication can occur.
getResultPath(CldrPath path, List<String> args, DynamicVars varLookupFn)258     private CldrPath getResultPath(CldrPath path, List<String> args, DynamicVars varLookupFn) {
259         VarString basePath = instructions.get(Instruction.BASE_XPATH);
260         if (basePath == null) {
261             return path;
262         }
263         String resolvedBasePath = basePath.apply(dynamicVarFn.andThen(varLookupFn)).get();
264         return parseDistinguishingPath(substituteArgs(resolvedBasePath, args));
265     }
266 
267     /**
268      * Returns a fallback function if this specification has the "fallback=" instruction.
269      * The function takes a resolved resource bundle path and returns the possible fallback
270      * values for it. Note that currently fallback values do not support either quoting or
271      * grouping (but they easily could).
272      */
getFallbackFunction()273     Optional<BiFunction<RbPath, DynamicVars, Optional<Result>>> getFallbackFunction() {
274         VarString fallbackSpec = instructions.get(Instruction.FALLBACK);
275         if (fallbackSpec == null) {
276             return Optional.empty();
277         }
278         // This is the only place where any hacking of regular expressions occurs. The fallback
279         // function must only return a value if the given resolved resource bundle path could
280         // have been a match for the path specification.
281         //
282         // In order to avoid ambiguity for paths such as "foo/$1/$2/bar" and "foo/$1/bar" which
283         // should not both be matched, we explicitly disallow '/' in argument values. In theory
284         // this is problematic, since '/' should be an allowed character, but the issues caused
285         // by ambiguous matching are worse.
286         // TODO: Fix/replace all of this fallback mess with something cleaner.
287         Pattern rbPathMatcher = getRbPathMatcher(rbPathSpec);
288 
289         // Another, frankly terrifying, bit of hackery to support fallback specifications with
290         // $N argument substitution (this currently only happens once, but must be supported).
291         // Just another reason to want to replace the current fallback mechanism.
292         fallbackSpec = maybeRewriteFallbackSpec(fallbackSpec);
293 
294         // Just copying here to make it effectively final.
295         VarString finalFallbackSpec = fallbackSpec;
296         return Optional.of(
297             (p, varFn) -> getFallbackResult(p, varFn, rbPathMatcher, finalFallbackSpec));
298     }
299 
getFallbackResult( RbPath rbPath, DynamicVars varFn, Pattern rbPathMatcher, VarString fallbackSpec)300     private Optional<Result> getFallbackResult(
301         RbPath rbPath, DynamicVars varFn, Pattern rbPathMatcher, VarString fallbackSpec) {
302         // Check is the given rbPath could be associated with this fallback (most are not).
303         Matcher matcher = rbPathMatcher.matcher(rbPath.toString());
304         if (!matcher.matches()) {
305             return Optional.empty();
306         }
307         // Expect that once any dynamic variables are provided to the fallback specification,
308         // we can get the resolved fallback specification (potentially with $N placeholders to
309         // be filled in from the resource bundle path).
310         String specStr = fallbackSpec.apply(dynamicVarFn.andThen(varFn)).get();
311         if (matcher.groupCount() > 0) {
312             specStr = substituteArgs(specStr, n -> matcher.group(n + 1), matcher.groupCount());
313         }
314 
315         // Split the fallback value _without_ considering quoting. This matches the original
316         // behaviour but could cause all sorts of subtle issues if values contained quotes.
317         // TODO: Rework transformation rules to make quoting behaviour deterministic.
318         Iterable<String> values =
319             VALUE_SPLITTER.splitToList(specStr).stream()
320                 // Fallback values that "look like" CLDR paths are auto-magically resolved.
321                 .map(v -> v.startsWith("//") ? varFn.apply(parseDistinguishingPath(v)) : v)
322                 .collect(toImmutableList());
323         return Optional.of(new FallbackResult(rbPath, values));
324     }
325 
326     // WARNING: Another very hacky behaviour (used exactly once) is that "$N" argument
327     // substitutions are allowed in fallback values. This is highly problematic because
328     // since the fallback value must be synthesized only from the resource bundle path,
329     // there's no way for this substitution to handle:
330     // 1: multi-valued list arguments
331     // 2: arguments that didn't appear in the resource bundle path
332     // 3: dynamic path variables (e.g. %D=//some/path)
333     //
334     // An example would be something like a resource bundle specification of:
335     //   /Baz/$2/$1
336     // and a fallback value of:
337     //   Foo$1/Bar$2
338     //
339     // Here the order of substitution is not maintained and the original path specification
340     // has values that are not naturally ordered (or possibly even duplicated). The pattern
341     // we calculate from the resource bundle path specification will match/capture groups in
342     // "natural order" (i.e. "/Baz/(...)/(...)") so we have to rewrite the order of the
343     // placeholders in the fallback specification to match (e.g. "Foo$2/Bar$1").
344     // TODO: Figure out a way to remove all of this extreme complexity.
maybeRewriteFallbackSpec( VarString fallbackSpec)345     private VarString maybeRewriteFallbackSpec(
346         VarString fallbackSpec) {
347         Optional<String> fallback = fallbackSpec.resolve();
348         // If the fallback string is not present, it's because the VarString still has
349         // unresolved "dynamic" variables for late binding. This is okay, but should not
350         // be mixed with argument substitution.
351         if (!fallback.isPresent() || !fallback.get().contains("$")) {
352             return fallbackSpec;
353         }
354         // After the quick rejection check for '$', do a proper search for $N variables (since
355         // '$' is permitted as a literal if not followed by a digit).
356         Matcher fallbackMatcher = ARG_PLACEHOLDER.matcher(fallback.get());
357         if (!fallbackMatcher.find()) {
358             return fallbackSpec;
359         }
360 
361         // Fallback spec has $N in it, triggering super hacky behaviour.
362         Matcher pathMatcher = ARG_PLACEHOLDER.matcher(rbPathSpec);
363         checkState(pathMatcher.find(),
364             "$N arguments in fallback must be present in the resource bundle path: %s",
365             rbPathSpec);
366         // Explicit group characters ("1"..."9") in the order they appear in the
367         // resource bundle path. There can be duplicates (e.g. "/Foo/$1/Bar$1").
368         List<Character> groupIds = new ArrayList<>();
369         do {
370             groupIds.add(pathMatcher.group().charAt(1));
371         } while (pathMatcher.find());
372 
373         // Special check to avoid a horrible bug if we every had more than 9 distinct
374         // placeholders (essentially impossible with current data). If it did happen,
375         // the returned index below would be >= 9 and we would get "$X", where 'X' was
376         // not a numeric value.
377         checkState(groupIds.size() < 10,
378             "too many placeholders in resource bundle path: %s", rbPathSpec);
379 
380         // Now find each placeholder in the fallback specification string and map it to
381         // the equivalent index for the path matcher we just created.
382         StringBuilder rewrittenFallbackSpec = new StringBuilder(fallback.get());
383         do {
384             int placeholderPos = fallbackMatcher.start() + 1;
385             // The new ID is the index of the corresponding placeholder offset by '1'.
386             char placeholderDigit = rewrittenFallbackSpec.charAt(placeholderPos);
387             int newPlaceholderIndex = groupIds.indexOf(placeholderDigit);
388             checkState(newPlaceholderIndex != -1,
389                 "fallback values may only contain arguments from the resource bundle path: %s",
390                 fallback.get());
391             rewrittenFallbackSpec.setCharAt(placeholderPos, (char)('1' + newPlaceholderIndex));
392         } while (fallbackMatcher.find());
393         return VarString.of(rewrittenFallbackSpec.toString());
394     }
395 
396     /** Base class of either a matched or a fallback result. */
397     private abstract class AbstractResult extends Result {
398         // Split and resolved values for this result (see also "isGrouped()").
399         private final ImmutableList<String> values;
400 
401         // The "source" CLDR path of a matched result (omitted if this is a fallback result).
402         // Note that this is the resolved "base_xpath" if it was specified in the instructions.
403         private final Optional<CldrPath> basePath;
404 
405         // Calculated eagerly since we always expect results to need to be deduplicated.
406         private final int hashCode;
407 
408         AbstractResult(RbPath key, Iterable<String> values, Optional<CldrPath> path) {
409             super(key);
410             this.values = ImmutableList.copyOf(values);
411             this.basePath = checkNotNull(path);
412             // Same attributes in the same order as tested for in equals().
413             this.hashCode = Objects.hash(getKey(), getPath(), isGrouped(), getValues());
414         }
415 
416         // Returns the specification from which this result was obtained. This is essential for
417         // correct ordering and determining fallback values, but is not directly used for
418         // determining result equality (since duplicate results can be generated by different
419         // specifications).
420         final ResultSpec getSpec() {
421             return ResultSpec.this;
422         }
423 
424         final Optional<CldrPath> getPath() {
425             return basePath;
426         }
427 
428         final boolean wasMatched() {
429             // We could also do this via a boolean field.
430             return this instanceof MatchedResult;
431         }
432 
433         @Override
434         public final ImmutableList<String> getValues() {
435             return values;
436         }
437 
438         @Override
439         public final int compareTo(Result other) {
440             checkArgument(other instanceof AbstractResult,
441                 "unknown result type: %s", other.getClass());
442             return RESULT_ORDERING.compare(this, (AbstractResult) other);
443         }
444 
445         @Override
446         public final int hashCode() {
447             return hashCode;
448         }
449 
450         // Equality semantics of results is ESSENTIAL for correct behaviour, especially the
451         // deduplication of results. See also "getSpec()", "getPath()", and RESULT_ORDERING.
452         @Override
453         public final boolean equals(Object obj) {
454             // Different subclasses are never equal, so test class directly (not instanceof).
455             if (obj == null || !getClass().equals(obj.getClass())) {
456                 return false;
457             }
458             AbstractResult other = (AbstractResult) obj;
459             // DO NOT test the result specifier here. Equal results can be generated from
460             // different result specifications (if "base_xpath" was used).
461             return getKey().equals(other.getKey())
462                 && getPath().equals(other.getPath())
463                 && isGrouped() == other.isGrouped()
464                 // Alternatively assert that values are equal if everything else is.
465                 && getValues().equals(other.getValues());
466         }
467     }
468 
469     // Result created for an explicit path match using captured arguments.
470     private final class MatchedResult extends AbstractResult {
471         MatchedResult(RbPath key, Iterable<String> values, CldrPath path) {
472             super(key, values, Optional.of(path));
473         }
474 
475         @Override
476         public boolean isGrouped() {
477             // We don't need to use the "group" value at all and it can be removed from the
478             // configuration file at some point.
479             return instructions.containsKey(Instruction.GROUP);
480         }
481 
482         @Override
483         public boolean isFallbackFor(Result r) {
484             // Matched results are never a fallback for anything.
485             return false;
486         }
487     }
488 
489     // Result created to hold possible fallback values for a specified resource bundle path.
490     private final class FallbackResult extends AbstractResult {
491         FallbackResult(RbPath rbPath, Iterable<String> values) {
492             super(rbPath, values, Optional.empty());
493         }
494 
495         // Delete this method and move the other one into AbstractResult if we decide to allow
496         // grouping for fallback values (it's not clear if it's a good idea).
497         @Override
498         public boolean isGrouped() {
499             return false;
500         }
501 
502         @Override
503         public boolean isFallbackFor(Result r) {
504             // We are a fallback if we came from the same specification as a matched result.
505             // To prevent duplication of fallback results, we also return true if the result we
506             // are "equal()" to the given result (equivalent fallback results can come from
507             // different input paths).
508             checkArgument(r instanceof AbstractResult, "unsupported result type: %s", r);
509             AbstractResult result = (AbstractResult) r;
510             return result.wasMatched() ? getSpec().equals(result.getSpec()) : equals(result);
511         }
512     }
513 
514     // ==== Static helper functions ====
515 
516     // Matches any "$N" placeholder without capturing.
517     private static final Pattern ARG_PLACEHOLDER = Pattern.compile("\\$[1-9]");
518 
519     // Turn "$N" into a capturing groups.
520     //
521     // Note that this code currently assumes that each "$N" placeholder matches a single path
522     // segment (i.e. the captured values cannot contain '/'). This is an artificial restriction
523     // since resource bundle paths can have quoting in, so we could detect quoted placeholders
524     // and allow any characters. However at the moment this isn't an issue, and none of the
525     // "$N" placeholders in the paths expects to match anything with '/' in.
526     //
527     // TODO: Fix this to handle quoted placeholders (e.g. "$N" or <$N>) properly.
528     private static Pattern getRbPathMatcher(String rbPathSpec) {
529         // An RbPath instance's toString() does not have a leading '/' on it, so well have to
530         // account for that here (or we could just remove the leading '/' from paths in the
531         // config file...
532         if (rbPathSpec.startsWith("/")) {
533             rbPathSpec = rbPathSpec.substring(1);
534         }
535         // Protect potential regex meta-characters in the original resource bundle path. Using
536         // '\Q' and '\E' to mark quotation boundaries is the safest way to do this, but that
537         // means we also need to handle '\E' in the original string (incredibly unlikely but it
538         // would be super hard to debug if it ever happened).
539         // TODO: If resource paths cannot contain literal '\' or '$', add checks and simplify.
540         String regex = "\\Q" + rbPathSpec.replace("\\E", "\\E\\E\\Q") + "\\E";
541 
542         // Remember that you could get "$1$2" here and the regex groups that replace them will
543         // abut. Use reluctant matching (i.e. "+?") to avoid any backtracking in this case.
544         // We assume that the substituted arguments contained at least one character, and so we
545         // capture at least one character per group here.
546         regex = ARG_PLACEHOLDER.matcher(regex).replaceAll("\\\\E([^/]+?)\\\\Q");
547         return Pattern.compile(regex);
548     }
549 
550     private static String substituteArgs(String spec, List<String> args) {
551         return substituteArgs(spec, args::get, args.size());
552     }
553 
554     // Substitutes "$N" (N = 1...9) placeholders for values obtained from a zero-indexed
555     // function (i.e. "$N" --> args(N - 1)).
556     private static String substituteArgs(String spec, Function<Integer, String> args, int size) {
557         return RegexTransformer.substitute(
558             spec, '$', c -> args.apply(checkElementIndex(c - '1', size, "argument index")));
559     }
560 
561     // Matches arguments with or without enclosing quotes.
562     private static final Pattern ARGUMENT = Pattern.compile("[<\"]?\\$(\\d)[\">]?");
563 
564     // Logic mostly copied from original RegexManager class. Finds first unquoted $N (N=1..9)
565     // and returns N-1 (or -1 if no match). We do not permit $0 to appear even though it is
566     // captured by the regex because it's just the entire path.
getSplitArgIndex(String rbPath)567     private static int getSplitArgIndex(String rbPath) {
568         // Captures a $N placeholder, but might catch surrounding quoting as well.
569         Matcher matcher = ARGUMENT.matcher(rbPath);
570         while (matcher.find()) {
571             char startChar = rbPath.charAt(matcher.start());
572             char endChar = rbPath.charAt(matcher.end() - 1);
573             // Splitting occurs for the first unquoted placeholder, so ignore <$1> and "$N".
574             // Q: Why two different "quoting" schemes?
575             // A: It's complex and relates the something called "hidden labels".
576             boolean shouldSplit = !((startChar == '"' && endChar == '"') ||
577                 (startChar == '<' && endChar == '>'));
578             if (shouldSplit) {
579                 // Allowed "$N" argument placeholders go from $1 to $9 ($0 is disallowed) and
580                 // arguments are zero-indexed, so we expect an index from 0 to 8.
581                 int groupNumber = Integer.parseInt(matcher.group(1));
582                 checkArgument(groupNumber >= 1 && groupNumber <= 9,
583                     "invalid split argument: %s", groupNumber);
584                 return groupNumber - 1;
585             }
586         }
587         return -1;
588     }
589 
590     // Splits a possibly quoted string, where we need to handle \". This is a bit dubious
591     // though as we don't detect or unescape \\. Thus it's impossible to represent a single '\'
592     // at the end of a quoted string (e.g. "$1" where the expansion of $1 has a trailing '\'.
593     // It's also impossible to have a value that should be split but which contains '"'.
594     //
595     // This mimics the original RegexManager behaviour where spaces in and quotes in
596     // substituted values are _not_ escaped.
splitValues(String value)597     private static ImmutableList<String> splitValues(String value) {
598         int qstart = nextBareQuoteIndex(value,  0);
599         if (qstart == -1) {
600             return ImmutableList.copyOf(VALUE_SPLITTER.split(value));
601         }
602         ImmutableList.Builder<String> values = ImmutableList.builder();
603         int rawStart = 0;
604         do {
605             values.addAll(VALUE_SPLITTER.split(value.substring(rawStart, qstart)));
606             int qend = nextBareQuoteIndex(value,  qstart + 1);
607             checkArgument(qend != -1, "mismatched quotes in splittable value: %s", value);
608             // Remember to unescape any '"' found in the quoted regions.
609             values.add(value.substring(qstart + 1, qend).replace("\\\"", "\""));
610             rawStart = qend + 1;
611             qstart = nextBareQuoteIndex(value,  qend + 1);
612         } while (qstart != -1);
613         values.addAll(VALUE_SPLITTER.split(value.substring(rawStart)));
614         return values.build();
615     }
616 
617     // Returns the index of the next '"' character that's not preceded by a '\'.
nextBareQuoteIndex(String s, int i)618     private static int nextBareQuoteIndex(String s, int i) {
619         i = s.indexOf('"', i);
620         // If i == 0, then '"' is the first char and must be "bare".
621         if (i > 0) {
622             do {
623                 if (s.charAt(i - 1) != '\\') {
624                     break;
625                 }
626                 i = s.indexOf('\\', i + 1);
627             } while (i >= 0);
628         }
629         return i;
630     }
631 }
632