• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Libphonenumber Authors.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.i18n.phonenumbers.metadata.regex;
18 
19 import static com.google.common.base.Preconditions.checkArgument;
20 import static com.google.common.base.Preconditions.checkNotNull;
21 import static com.google.common.collect.ImmutableList.toImmutableList;
22 
23 import com.google.common.annotations.VisibleForTesting;
24 import com.google.common.collect.ImmutableList;
25 import com.google.common.collect.LinkedHashMultiset;
26 import com.google.common.collect.Multiset;
27 import com.google.i18n.phonenumbers.metadata.RangeSpecification;
28 import com.google.i18n.phonenumbers.metadata.RangeTree;
29 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge;
30 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode;
31 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor;
32 import java.util.ArrayList;
33 import java.util.List;
34 import java.util.Optional;
35 import java.util.stream.IntStream;
36 import javax.annotation.Nullable;
37 
38 /**
39  * An optimization for RangeTree DFAs which attempts to isolate and extract subgraphs which would
40  * otherwise cause a lot of repetition in the generated regular expression.
41  */
42 public final class SubgroupOptimizer {
43   /**
44    * Returns the subgraph which is likely to cause the most repetition in the regular expression
45    * of the given DFA. Subtracting the result out of the original range tree and generating two
46    * distinct regular expressions is likely to be shorter than the regular expression of the
47    * original range.
48    */
extractRepeatingSubgraph(RangeTree ranges)49   public static Optional<RangeTree> extractRepeatingSubgraph(RangeTree ranges) {
50     return LinkNodeVisitor
51         .findBridgingNode(ranges)
52         .flatMap(n -> SubgraphExtractionVisitor.extractSubgraph(ranges, n));
53   }
54 
55   /**
56    * A visitor which applies two types of weights to every interior node in a DFA.
57    * <ul>
58    *   <li>A count of incoming edges to that node.
59    *   <li>A count of all edges in the subgraph rooted at that node.
60    * </ul>
61    * These are then multiplied together using the cost function:
62    * <pre>cost(n) = subgraph-weight(n) * (in-order(n) - 1)</pre>
63    * get get a proxy for the cost of additional duplicates likely to be created by this node.
64    */
65   static class LinkNodeVisitor implements DfaVisitor {
66     // Reasonable approximation for the cost of an edge in a subgraph is the length of the
67     // corresponding range specification (it doesn't work so well for repeated edges like
68     // 'xxxxxxxx' --> "\d{8}", but it's good to help break ties in the cost function).
69     private static final ImmutableList<Integer> EDGE_WEIGHTS =
70         IntStream.rangeClosed(1, 0x3FF)
71             .mapToObj(m -> RangeSpecification.toString(m).length())
72             .collect(toImmutableList());
73 
74     // Important to use "linked" multisets here (at least for the one we iterate over) since
75     // otherwise we end up with non-deterministic regular expression generation.
76     private final Multiset<DfaNode> inOrder = LinkedHashMultiset.create();
77     private final Multiset<DfaNode> subgraphWeight = LinkedHashMultiset.create();
78 
79     /**
80      * Returns the interior node whose subgraph is likely to cause the most repetition in the
81      * regular expression of the given DFA.
82      */
findBridgingNode(RangeTree ranges)83     static Optional<DfaNode> findBridgingNode(RangeTree ranges) {
84       checkArgument(!ranges.isEmpty(), "cannot visit empty ranges");
85       LinkNodeVisitor v = new LinkNodeVisitor();
86       ranges.accept(v);
87       return Optional.ofNullable(v.getHighestCostNode());
88     }
89 
getEdgeWeight(DfaEdge edge)90     private static int getEdgeWeight(DfaEdge edge) {
91       // Subtract 1 since the array is 1-based (a zero edge mask is not legal).
92       return EDGE_WEIGHTS.get(edge.getDigitMask() - 1);
93     }
94 
95     @VisibleForTesting
getSubgraphWeight(DfaNode n)96     int getSubgraphWeight(DfaNode n) {
97       return subgraphWeight.count(n);
98     }
99 
100     @VisibleForTesting
getInOrder(DfaNode n)101     int getInOrder(DfaNode n) {
102       return inOrder.count(n);
103     }
104 
105     // This returns null if no edge has a cost greater than zero. Since the cost function uses
106     // (in-order(n) - 1) this is trivially true for any graph where all interior nodes have only
107     // a single in-edge (the terminal node can have more than one in-edge, but it has a weight of
108     // zero and the initial node is never considered a candidate).
109     @VisibleForTesting
110     @Nullable
getHighestCostNode()111     DfaNode getHighestCostNode() {
112       DfaNode node = null;
113       int maxWeight = 0;
114       for (DfaNode n : inOrder.elementSet()) {
115         int weight = getSubgraphWeight(n) * (getInOrder(n) - 1);
116         if (weight > maxWeight) {
117           maxWeight = weight;
118           node = n;
119         }
120       }
121       return node;
122     }
123 
124     @Override
visit(DfaNode source, DfaEdge edge, DfaNode target)125     public void visit(DfaNode source, DfaEdge edge, DfaNode target) {
126       // The weight is zero only if we haven't visited this node before (or it's the terminal).
127       int targetWeight = subgraphWeight.count(target);
128       if (targetWeight == 0 && !target.equals(RangeTree.getTerminal())) {
129         target.accept(this);
130         targetWeight = subgraphWeight.count(target);
131       }
132       // Add an extra one for the edge we are processing now and increment our target's in-order.
133       subgraphWeight.add(source, targetWeight + getEdgeWeight(edge));
134       inOrder.add(target);
135     }
136   }
137 
138   /**
139    * A visitor to extract the subgraph of a DFA which passes through a specified interior
140    * "bridging" node.
141    */
142   private static class SubgraphExtractionVisitor implements DfaVisitor {
143     private final DfaNode bridgingNode;
144     private final List<RangeSpecification> paths = new ArrayList<>();
145     private RangeSpecification path = RangeSpecification.empty();
146     private boolean sawBridgingNode = false;
147     private boolean splitHappens = false;
148 
149     /** Returns the subgraph which passes through the specified node. */
extractSubgraph(RangeTree ranges, DfaNode node)150     static Optional<RangeTree> extractSubgraph(RangeTree ranges, DfaNode node) {
151       SubgraphExtractionVisitor v = new SubgraphExtractionVisitor(node);
152       ranges.accept(v);
153       // Only return proper subgraphs.
154       return v.splitHappens ? Optional.of(RangeTree.from(v.paths)) : Optional.empty();
155     }
156 
SubgraphExtractionVisitor(DfaNode bridgingNode)157     private SubgraphExtractionVisitor(DfaNode bridgingNode) {
158       this.bridgingNode = checkNotNull(bridgingNode);
159     }
160 
161     @Override
visit(DfaNode source, DfaEdge edge, DfaNode target)162     public void visit(DfaNode source, DfaEdge edge, DfaNode target) {
163       RangeSpecification oldPath = path;
164       path = path.extendByMask(edge.getDigitMask());
165       // Potentially emit paths for any terminating node (not just the end of the graph). We have
166       // to extract the entire sub-graph _after_ the bridging node, including terminating nodes.
167       if (target.canTerminate()) {
168         // Emit path if we are "below" the bridging node.
169         if (sawBridgingNode) {
170           paths.add(path);
171         } else {
172           // Records that there were other paths not in the subgroup (since we only want to return
173           // a new DFA that's a proper subgraph of the original graph).
174           splitHappens = true;
175         }
176       }
177       if (target.equals(bridgingNode)) {
178         // Recurse with the flag set to emit paths once we hit the terminal node (note that the
179         // bridging node cannot be the terminal node).
180         sawBridgingNode = true;
181         target.accept(this);
182         sawBridgingNode = false;
183       } else {
184         // Recurse normally regardless of the flag.
185         target.accept(this);
186       }
187       path = oldPath;
188     }
189   }
190 }
191