1 /* 2 * Copyright (C) 2017 The Libphonenumber Authors. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.i18n.phonenumbers.metadata.regex; 18 19 import static com.google.common.base.Preconditions.checkArgument; 20 import static com.google.common.base.Preconditions.checkNotNull; 21 import static com.google.common.collect.ImmutableList.toImmutableList; 22 23 import com.google.common.annotations.VisibleForTesting; 24 import com.google.common.collect.ImmutableList; 25 import com.google.common.collect.LinkedHashMultiset; 26 import com.google.common.collect.Multiset; 27 import com.google.i18n.phonenumbers.metadata.RangeSpecification; 28 import com.google.i18n.phonenumbers.metadata.RangeTree; 29 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; 30 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; 31 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; 32 import java.util.ArrayList; 33 import java.util.List; 34 import java.util.Optional; 35 import java.util.stream.IntStream; 36 import javax.annotation.Nullable; 37 38 /** 39 * An optimization for RangeTree DFAs which attempts to isolate and extract subgraphs which would 40 * otherwise cause a lot of repetition in the generated regular expression. 41 */ 42 public final class SubgroupOptimizer { 43 /** 44 * Returns the subgraph which is likely to cause the most repetition in the regular expression 45 * of the given DFA. Subtracting the result out of the original range tree and generating two 46 * distinct regular expressions is likely to be shorter than the regular expression of the 47 * original range. 48 */ extractRepeatingSubgraph(RangeTree ranges)49 public static Optional<RangeTree> extractRepeatingSubgraph(RangeTree ranges) { 50 return LinkNodeVisitor 51 .findBridgingNode(ranges) 52 .flatMap(n -> SubgraphExtractionVisitor.extractSubgraph(ranges, n)); 53 } 54 55 /** 56 * A visitor which applies two types of weights to every interior node in a DFA. 57 * <ul> 58 * <li>A count of incoming edges to that node. 59 * <li>A count of all edges in the subgraph rooted at that node. 60 * </ul> 61 * These are then multiplied together using the cost function: 62 * <pre>cost(n) = subgraph-weight(n) * (in-order(n) - 1)</pre> 63 * get get a proxy for the cost of additional duplicates likely to be created by this node. 64 */ 65 static class LinkNodeVisitor implements DfaVisitor { 66 // Reasonable approximation for the cost of an edge in a subgraph is the length of the 67 // corresponding range specification (it doesn't work so well for repeated edges like 68 // 'xxxxxxxx' --> "\d{8}", but it's good to help break ties in the cost function). 69 private static final ImmutableList<Integer> EDGE_WEIGHTS = 70 IntStream.rangeClosed(1, 0x3FF) 71 .mapToObj(m -> RangeSpecification.toString(m).length()) 72 .collect(toImmutableList()); 73 74 // Important to use "linked" multisets here (at least for the one we iterate over) since 75 // otherwise we end up with non-deterministic regular expression generation. 76 private final Multiset<DfaNode> inOrder = LinkedHashMultiset.create(); 77 private final Multiset<DfaNode> subgraphWeight = LinkedHashMultiset.create(); 78 79 /** 80 * Returns the interior node whose subgraph is likely to cause the most repetition in the 81 * regular expression of the given DFA. 82 */ findBridgingNode(RangeTree ranges)83 static Optional<DfaNode> findBridgingNode(RangeTree ranges) { 84 checkArgument(!ranges.isEmpty(), "cannot visit empty ranges"); 85 LinkNodeVisitor v = new LinkNodeVisitor(); 86 ranges.accept(v); 87 return Optional.ofNullable(v.getHighestCostNode()); 88 } 89 getEdgeWeight(DfaEdge edge)90 private static int getEdgeWeight(DfaEdge edge) { 91 // Subtract 1 since the array is 1-based (a zero edge mask is not legal). 92 return EDGE_WEIGHTS.get(edge.getDigitMask() - 1); 93 } 94 95 @VisibleForTesting getSubgraphWeight(DfaNode n)96 int getSubgraphWeight(DfaNode n) { 97 return subgraphWeight.count(n); 98 } 99 100 @VisibleForTesting getInOrder(DfaNode n)101 int getInOrder(DfaNode n) { 102 return inOrder.count(n); 103 } 104 105 // This returns null if no edge has a cost greater than zero. Since the cost function uses 106 // (in-order(n) - 1) this is trivially true for any graph where all interior nodes have only 107 // a single in-edge (the terminal node can have more than one in-edge, but it has a weight of 108 // zero and the initial node is never considered a candidate). 109 @VisibleForTesting 110 @Nullable getHighestCostNode()111 DfaNode getHighestCostNode() { 112 DfaNode node = null; 113 int maxWeight = 0; 114 for (DfaNode n : inOrder.elementSet()) { 115 int weight = getSubgraphWeight(n) * (getInOrder(n) - 1); 116 if (weight > maxWeight) { 117 maxWeight = weight; 118 node = n; 119 } 120 } 121 return node; 122 } 123 124 @Override visit(DfaNode source, DfaEdge edge, DfaNode target)125 public void visit(DfaNode source, DfaEdge edge, DfaNode target) { 126 // The weight is zero only if we haven't visited this node before (or it's the terminal). 127 int targetWeight = subgraphWeight.count(target); 128 if (targetWeight == 0 && !target.equals(RangeTree.getTerminal())) { 129 target.accept(this); 130 targetWeight = subgraphWeight.count(target); 131 } 132 // Add an extra one for the edge we are processing now and increment our target's in-order. 133 subgraphWeight.add(source, targetWeight + getEdgeWeight(edge)); 134 inOrder.add(target); 135 } 136 } 137 138 /** 139 * A visitor to extract the subgraph of a DFA which passes through a specified interior 140 * "bridging" node. 141 */ 142 private static class SubgraphExtractionVisitor implements DfaVisitor { 143 private final DfaNode bridgingNode; 144 private final List<RangeSpecification> paths = new ArrayList<>(); 145 private RangeSpecification path = RangeSpecification.empty(); 146 private boolean sawBridgingNode = false; 147 private boolean splitHappens = false; 148 149 /** Returns the subgraph which passes through the specified node. */ extractSubgraph(RangeTree ranges, DfaNode node)150 static Optional<RangeTree> extractSubgraph(RangeTree ranges, DfaNode node) { 151 SubgraphExtractionVisitor v = new SubgraphExtractionVisitor(node); 152 ranges.accept(v); 153 // Only return proper subgraphs. 154 return v.splitHappens ? Optional.of(RangeTree.from(v.paths)) : Optional.empty(); 155 } 156 SubgraphExtractionVisitor(DfaNode bridgingNode)157 private SubgraphExtractionVisitor(DfaNode bridgingNode) { 158 this.bridgingNode = checkNotNull(bridgingNode); 159 } 160 161 @Override visit(DfaNode source, DfaEdge edge, DfaNode target)162 public void visit(DfaNode source, DfaEdge edge, DfaNode target) { 163 RangeSpecification oldPath = path; 164 path = path.extendByMask(edge.getDigitMask()); 165 // Potentially emit paths for any terminating node (not just the end of the graph). We have 166 // to extract the entire sub-graph _after_ the bridging node, including terminating nodes. 167 if (target.canTerminate()) { 168 // Emit path if we are "below" the bridging node. 169 if (sawBridgingNode) { 170 paths.add(path); 171 } else { 172 // Records that there were other paths not in the subgroup (since we only want to return 173 // a new DFA that's a proper subgraph of the original graph). 174 splitHappens = true; 175 } 176 } 177 if (target.equals(bridgingNode)) { 178 // Recurse with the flag set to emit paths once we hit the terminal node (note that the 179 // bridging node cannot be the terminal node). 180 sawBridgingNode = true; 181 target.accept(this); 182 sawBridgingNode = false; 183 } else { 184 // Recurse normally regardless of the flag. 185 target.accept(this); 186 } 187 path = oldPath; 188 } 189 } 190 } 191