• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Libphonenumber Authors.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.i18n.phonenumbers.metadata.regex;
18 
19 import static com.google.common.base.Preconditions.checkArgument;
20 import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.ALLOW_EDGE_SPLITTING;
21 import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.REQUIRE_EQUAL_EDGES;
22 import static java.util.stream.Collectors.joining;
23 
24 import com.google.common.base.Preconditions;
25 import com.google.common.graph.ValueGraph;
26 import com.google.i18n.phonenumbers.metadata.RangeTree;
27 import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer;
28 import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy;
29 import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge;
30 import java.util.Optional;
31 
32 /** Produces partially optimized regular expressions from {@code RangeTree}s. */
33 public final class RegexGenerator {
34   private static final RegexGenerator BASIC = new RegexGenerator(false, false, false, false);
35 
36   // NOTE: Tail optimization should remain disabled since it seems to undo some of the benefits of
37   // subgroup optimization. At some point the code can probably just be removed.
38   private static final RegexGenerator DEFAULT_XML =
39       BASIC.withDfaFactorization().withSubgroupOptimization();
40 
41   /**
42    * Returns a basic regular expression generator with no optional optimizations enabled. This will
43    * produce regular expressions with a simpler structure than other generators but output will
44    * almost always be longer.
45    */
basic()46   public static RegexGenerator basic() {
47     return BASIC;
48   }
49 
50   /**
51    * Returns the default regex generator for XML data. This should be used by any tool wishing to
52    * obtain the same regular expressions as the legacy XML data. It is deliberately not specified
53    * as to which optimizations are enabled for this regular expression generator.
54    */
defaultXmlGenerator()55   public static RegexGenerator defaultXmlGenerator() {
56     return DEFAULT_XML;
57   }
58 
59   /**
60    * Returns a new regular expression generator which uses the {@code '.'} token for matching any
61    * digit (rather than {@code '\d'}). This results in shorter output, but possibly at the cost of
62    * performance on certain platforms (and a degree of readability).
63    */
withDotMatch()64   public RegexGenerator withDotMatch() {
65     Preconditions.checkState(!this.useDotMatch, "Dot-matching already enabled");
66     return new RegexGenerator(true, this.factorizeDfa, this.optimizeSubgroups, this.optimizeTail);
67   }
68 
69   /**
70    * Returns a new regular expression generator which applies a length-based factorization of the
71    * DFA graph in an attempt to reduce the number of problematic terminating states. This results
72    * in regular expressions with additional non-determinism, but which can greatly reduce size.
73    */
withDfaFactorization()74   public RegexGenerator withDfaFactorization() {
75     Preconditions.checkState(!this.factorizeDfa, "Length based factorizing already enabled");
76     return new RegexGenerator(this.useDotMatch, true, this.optimizeSubgroups, this.optimizeTail);
77   }
78 
79   /**
80    * Returns a new regular expression generator which applies experimental factorization of the
81    * DFA graph in an attempt to identify and handle subgroups which would cause repetition. This
82    * results in regular expressions with additional non-determinism, but which can greatly reduce
83    * size.
84    */
withSubgroupOptimization()85   public RegexGenerator withSubgroupOptimization() {
86     Preconditions.checkState(!this.optimizeSubgroups, "Subgroup optimization already enabled");
87     return new RegexGenerator(this.useDotMatch, this.factorizeDfa, true, this.optimizeTail);
88   }
89 
90   /**
91    * Returns a new regular expression generator which applies tail optimization to the intermediate
92    * NFA graph to factor out common trailing paths. This results in a small size improvement to
93    * many cases and does not adversely affect readability.
94    */
withTailOptimization()95   public RegexGenerator withTailOptimization() {
96     Preconditions.checkState(!this.optimizeTail, "Tail optimization already enabled");
97     return new RegexGenerator(this.useDotMatch, this.factorizeDfa, this.optimizeSubgroups, true);
98   }
99 
100   private final boolean useDotMatch;
101   private final boolean factorizeDfa;
102   private final boolean optimizeSubgroups;
103   private final boolean optimizeTail;
104 
RegexGenerator( boolean useDotMatch, boolean factorizeDfa, boolean optimizeSubgroups, boolean optimizeTail)105   private RegexGenerator(
106       boolean useDotMatch, boolean factorizeDfa, boolean optimizeSubgroups, boolean optimizeTail) {
107     this.useDotMatch = useDotMatch;
108     this.factorizeDfa = factorizeDfa;
109     this.optimizeSubgroups = optimizeSubgroups;
110     this.optimizeTail = optimizeTail;
111   }
112 
113   /**
114    * Generates a regular expression from a range tree, applying the configured options for this
115    * generator.
116    */
toRegex(RangeTree ranges)117   public String toRegex(RangeTree ranges) {
118     // The regex of the empty range is "a regex that matches nothing". This is meaningless.
119     checkArgument(!ranges.isEmpty(),
120         "cannot generate regular expression from empty ranges");
121     // We cannot generate any regular expressions if there are no explicit state transitions in the
122     // graph (i.e. we can generate "(?:<re>)?" but only if "<re>" is non-empty). We just get
123     // "the regex that always immediately terminates after no input". This is also meaningless.
124     checkArgument(!ranges.getInitial().equals(RangeTree.getTerminal()),
125         "range tree must not contain only the empty digit sequence: %s", ranges);
126 
127     String regex = generateFactorizedRegex(ranges);
128     if (optimizeSubgroups) {
129       regex = recursivelyOptimizeSubgroups(ranges, regex);
130     }
131     return regex;
132   }
133 
recursivelyOptimizeSubgroups(RangeTree ranges, String regex)134   private String recursivelyOptimizeSubgroups(RangeTree ranges, String regex) {
135     Optional<RangeTree> subgraphRanges = SubgroupOptimizer.extractRepeatingSubgraph(ranges);
136     if (subgraphRanges.isPresent()) {
137       RangeTree leftoverRanges = ranges.subtract(subgraphRanges.get());
138       String leftoverRegex = generateFactorizedRegex(leftoverRanges);
139       leftoverRegex = recursivelyOptimizeSubgroups(leftoverRanges, leftoverRegex);
140       String optimizedRegex = leftoverRegex + "|" + generateFactorizedRegex(subgraphRanges.get());
141       if (optimizedRegex.length() < regex.length()) {
142         regex = optimizedRegex;
143       }
144     }
145     return regex;
146   }
147 
generateFactorizedRegex(RangeTree ranges)148   private String generateFactorizedRegex(RangeTree ranges) {
149     String regex = regexOf(ranges);
150     if (factorizeDfa) {
151       regex = generateFactorizedRegex(ranges, regex, REQUIRE_EQUAL_EDGES);
152       regex = generateFactorizedRegex(ranges, regex, ALLOW_EDGE_SPLITTING);
153     }
154     return regex;
155   }
156 
generateFactorizedRegex(RangeTree dfa, String bestRegex, MergeStrategy strategy)157   private String generateFactorizedRegex(RangeTree dfa, String bestRegex, MergeStrategy strategy) {
158     String factoredRegex = RangeTreeFactorizer.factor(dfa, strategy).stream()
159         .map(this::regexOf)
160         .collect(joining("|"));
161     return factoredRegex.length() < bestRegex.length() ? factoredRegex : bestRegex;
162   }
163 
regexOf(RangeTree ranges)164   private String regexOf(RangeTree ranges) {
165     ValueGraph<Node, SimpleEdge> nfa = RangeTreeConverter.toNfaGraph(ranges);
166     if (optimizeTail) {
167       nfa = TrailingPathOptimizer.optimize(nfa);
168     }
169     return EdgeWriter.toRegex(NfaFlattener.flatten(nfa), useDotMatch);
170   }
171 }
172