1 /* 2 * Copyright (C) 2017 The Libphonenumber Authors. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.i18n.phonenumbers.metadata.regex; 18 19 import static com.google.common.base.Preconditions.checkArgument; 20 import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.ALLOW_EDGE_SPLITTING; 21 import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.REQUIRE_EQUAL_EDGES; 22 import static java.util.stream.Collectors.joining; 23 24 import com.google.common.base.Preconditions; 25 import com.google.common.graph.ValueGraph; 26 import com.google.i18n.phonenumbers.metadata.RangeTree; 27 import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer; 28 import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy; 29 import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; 30 import java.util.Optional; 31 32 /** Produces partially optimized regular expressions from {@code RangeTree}s. */ 33 public final class RegexGenerator { 34 private static final RegexGenerator BASIC = new RegexGenerator(false, false, false, false); 35 36 // NOTE: Tail optimization should remain disabled since it seems to undo some of the benefits of 37 // subgroup optimization. At some point the code can probably just be removed. 38 private static final RegexGenerator DEFAULT_XML = 39 BASIC.withDfaFactorization().withSubgroupOptimization(); 40 41 /** 42 * Returns a basic regular expression generator with no optional optimizations enabled. This will 43 * produce regular expressions with a simpler structure than other generators but output will 44 * almost always be longer. 45 */ basic()46 public static RegexGenerator basic() { 47 return BASIC; 48 } 49 50 /** 51 * Returns the default regex generator for XML data. This should be used by any tool wishing to 52 * obtain the same regular expressions as the legacy XML data. It is deliberately not specified 53 * as to which optimizations are enabled for this regular expression generator. 54 */ defaultXmlGenerator()55 public static RegexGenerator defaultXmlGenerator() { 56 return DEFAULT_XML; 57 } 58 59 /** 60 * Returns a new regular expression generator which uses the {@code '.'} token for matching any 61 * digit (rather than {@code '\d'}). This results in shorter output, but possibly at the cost of 62 * performance on certain platforms (and a degree of readability). 63 */ withDotMatch()64 public RegexGenerator withDotMatch() { 65 Preconditions.checkState(!this.useDotMatch, "Dot-matching already enabled"); 66 return new RegexGenerator(true, this.factorizeDfa, this.optimizeSubgroups, this.optimizeTail); 67 } 68 69 /** 70 * Returns a new regular expression generator which applies a length-based factorization of the 71 * DFA graph in an attempt to reduce the number of problematic terminating states. This results 72 * in regular expressions with additional non-determinism, but which can greatly reduce size. 73 */ withDfaFactorization()74 public RegexGenerator withDfaFactorization() { 75 Preconditions.checkState(!this.factorizeDfa, "Length based factorizing already enabled"); 76 return new RegexGenerator(this.useDotMatch, true, this.optimizeSubgroups, this.optimizeTail); 77 } 78 79 /** 80 * Returns a new regular expression generator which applies experimental factorization of the 81 * DFA graph in an attempt to identify and handle subgroups which would cause repetition. This 82 * results in regular expressions with additional non-determinism, but which can greatly reduce 83 * size. 84 */ withSubgroupOptimization()85 public RegexGenerator withSubgroupOptimization() { 86 Preconditions.checkState(!this.optimizeSubgroups, "Subgroup optimization already enabled"); 87 return new RegexGenerator(this.useDotMatch, this.factorizeDfa, true, this.optimizeTail); 88 } 89 90 /** 91 * Returns a new regular expression generator which applies tail optimization to the intermediate 92 * NFA graph to factor out common trailing paths. This results in a small size improvement to 93 * many cases and does not adversely affect readability. 94 */ withTailOptimization()95 public RegexGenerator withTailOptimization() { 96 Preconditions.checkState(!this.optimizeTail, "Tail optimization already enabled"); 97 return new RegexGenerator(this.useDotMatch, this.factorizeDfa, this.optimizeSubgroups, true); 98 } 99 100 private final boolean useDotMatch; 101 private final boolean factorizeDfa; 102 private final boolean optimizeSubgroups; 103 private final boolean optimizeTail; 104 RegexGenerator( boolean useDotMatch, boolean factorizeDfa, boolean optimizeSubgroups, boolean optimizeTail)105 private RegexGenerator( 106 boolean useDotMatch, boolean factorizeDfa, boolean optimizeSubgroups, boolean optimizeTail) { 107 this.useDotMatch = useDotMatch; 108 this.factorizeDfa = factorizeDfa; 109 this.optimizeSubgroups = optimizeSubgroups; 110 this.optimizeTail = optimizeTail; 111 } 112 113 /** 114 * Generates a regular expression from a range tree, applying the configured options for this 115 * generator. 116 */ toRegex(RangeTree ranges)117 public String toRegex(RangeTree ranges) { 118 // The regex of the empty range is "a regex that matches nothing". This is meaningless. 119 checkArgument(!ranges.isEmpty(), 120 "cannot generate regular expression from empty ranges"); 121 // We cannot generate any regular expressions if there are no explicit state transitions in the 122 // graph (i.e. we can generate "(?:<re>)?" but only if "<re>" is non-empty). We just get 123 // "the regex that always immediately terminates after no input". This is also meaningless. 124 checkArgument(!ranges.getInitial().equals(RangeTree.getTerminal()), 125 "range tree must not contain only the empty digit sequence: %s", ranges); 126 127 String regex = generateFactorizedRegex(ranges); 128 if (optimizeSubgroups) { 129 regex = recursivelyOptimizeSubgroups(ranges, regex); 130 } 131 return regex; 132 } 133 recursivelyOptimizeSubgroups(RangeTree ranges, String regex)134 private String recursivelyOptimizeSubgroups(RangeTree ranges, String regex) { 135 Optional<RangeTree> subgraphRanges = SubgroupOptimizer.extractRepeatingSubgraph(ranges); 136 if (subgraphRanges.isPresent()) { 137 RangeTree leftoverRanges = ranges.subtract(subgraphRanges.get()); 138 String leftoverRegex = generateFactorizedRegex(leftoverRanges); 139 leftoverRegex = recursivelyOptimizeSubgroups(leftoverRanges, leftoverRegex); 140 String optimizedRegex = leftoverRegex + "|" + generateFactorizedRegex(subgraphRanges.get()); 141 if (optimizedRegex.length() < regex.length()) { 142 regex = optimizedRegex; 143 } 144 } 145 return regex; 146 } 147 generateFactorizedRegex(RangeTree ranges)148 private String generateFactorizedRegex(RangeTree ranges) { 149 String regex = regexOf(ranges); 150 if (factorizeDfa) { 151 regex = generateFactorizedRegex(ranges, regex, REQUIRE_EQUAL_EDGES); 152 regex = generateFactorizedRegex(ranges, regex, ALLOW_EDGE_SPLITTING); 153 } 154 return regex; 155 } 156 generateFactorizedRegex(RangeTree dfa, String bestRegex, MergeStrategy strategy)157 private String generateFactorizedRegex(RangeTree dfa, String bestRegex, MergeStrategy strategy) { 158 String factoredRegex = RangeTreeFactorizer.factor(dfa, strategy).stream() 159 .map(this::regexOf) 160 .collect(joining("|")); 161 return factoredRegex.length() < bestRegex.length() ? factoredRegex : bestRegex; 162 } 163 regexOf(RangeTree ranges)164 private String regexOf(RangeTree ranges) { 165 ValueGraph<Node, SimpleEdge> nfa = RangeTreeConverter.toNfaGraph(ranges); 166 if (optimizeTail) { 167 nfa = TrailingPathOptimizer.optimize(nfa); 168 } 169 return EdgeWriter.toRegex(NfaFlattener.flatten(nfa), useDotMatch); 170 } 171 } 172