1 /* 2 * Copyright (C) 2017 The Libphonenumber Authors. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; 18 19 import static com.google.common.base.Preconditions.checkArgument; 20 import static com.google.common.collect.ImmutableMap.toImmutableMap; 21 import static com.google.common.collect.ImmutableSet.toImmutableSet; 22 import static java.lang.Integer.numberOfTrailingZeros; 23 24 import com.google.common.base.Joiner; 25 import com.google.common.base.Preconditions; 26 import com.google.common.collect.ImmutableList; 27 import com.google.common.collect.ImmutableMap; 28 import com.google.common.collect.ImmutableSet; 29 import com.google.common.collect.Iterables; 30 import com.google.common.graph.MutableValueGraph; 31 import com.google.common.graph.ValueGraph; 32 import com.google.common.graph.ValueGraphBuilder; 33 import com.google.i18n.phonenumbers.metadata.RangeTree; 34 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; 35 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; 36 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; 37 import java.util.ArrayList; 38 import java.util.Comparator; 39 import java.util.LinkedHashMap; 40 import java.util.List; 41 import java.util.Set; 42 import java.util.function.Function; 43 44 /** 45 * Compiles non-capturing phone number regular expressions into sequences of bytes suitable for 46 * creating {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher 47 * DigitSequenceMatcher} instances. 48 */ 49 public final class MatcherCompiler { 50 /** 51 * Compiles the given {@code RangeTree} into a sequence of bytes suitable for creating a 52 * {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher 53 * DigitSequenceMatcher}. 54 */ compile(RangeTree dfa)55 public static byte[] compile(RangeTree dfa) { 56 return compile(dfa, Statistics.NO_OP); 57 } 58 59 /** 60 * As {@link #compile(RangeTree)} but additionally accepts a {@link Statistics} instance 61 * to record metrics about the compilation. 62 */ compile(RangeTree dfa, Statistics stats)63 public static byte[] compile(RangeTree dfa, Statistics stats) { 64 return new MatcherCompiler(dfa).compile(stats); 65 } 66 67 /** The DFA from which the matcher data is to be compiled. */ 68 private final ValueGraph<DfaNode, DfaEdge> dfa; 69 /** The unique initial node of the DFA. */ 70 private final DfaNode init; 71 /** 72 * A map from nodes which are at the beginning of a sequence to that sequence. Not all nodes 73 * will be present in the key set of this map. 74 */ 75 private final ImmutableMap<DfaNode, Sequence> seqStart; 76 77 /** 78 * Builds a graph directly from the DFA in a RangeTree. 79 * 80 * <p>Rather than deal with the DFA tree directly (which is deliberately opaque as a data 81 * structure) we serialize it into a more maleable ValueGraph. This allows simpler graph 82 * traversal while maintaining a simple-as-possible node/edge structure. It's okay to reuse the 83 * RangeTree types {@code DfaNode} and {@code DfaEdge} here because they have the expected 84 * semantics (e.g. conforming to equals/hashcode etc...) but care must be taken not to keep the 85 * instances around for a long time, since this will keep larger parts of the original DFA alive 86 * in the garbage collector (but this is fine since only bytes are returned from this class). 87 */ buildGraph(RangeTree dfa)88 private static ValueGraph<DfaNode, DfaEdge> buildGraph(RangeTree dfa) { 89 Preconditions.checkArgument(!dfa.isEmpty()); 90 MutableValueGraph<DfaNode, DfaEdge> graph = 91 ValueGraphBuilder.directed().allowsSelfLoops(false).build(); 92 graph.addNode(dfa.getInitial()); 93 DfaVisitor visitor = new DfaVisitor() { 94 @Override 95 public void visit(DfaNode source, DfaEdge edge, DfaNode target) { 96 boolean isFirstVisit = graph.addNode(target); 97 graph.putEdgeValue(source, target, edge); 98 if (isFirstVisit) { 99 target.accept(this); 100 } 101 } 102 }; 103 dfa.accept(visitor); 104 return graph; 105 } 106 107 /** 108 * Creates a {@code MatcherCompiler} from the given automaton by generating all the 109 * {@code Sequence}'s of operations necessary to represent it. 110 */ MatcherCompiler(RangeTree ranges)111 MatcherCompiler(RangeTree ranges) { 112 this.dfa = buildGraph(ranges); 113 this.init = ranges.getInitial(); 114 LinkedHashMap<DfaNode, Sequence> start = new LinkedHashMap<>(); 115 buildSequencesFrom(init, start); 116 this.seqStart = ImmutableMap.copyOf(start); 117 } 118 119 /** 120 * Returns the output targets of the given node sorted according to the lowest "accepting" digit 121 * on the corresponding edge. This ordering is necessary for stability, but also correctness when 122 * building mapping operations. Apart from special cases (e.g. only one output) this is the only 123 * method which should be used to obtain output nodes. 124 */ sortedOutputs(DfaNode source)125 private ImmutableSet<DfaNode> sortedOutputs(DfaNode source) { 126 Comparator<DfaNode> ordering = Comparator.comparing( 127 target -> numberOfTrailingZeros(dfa.edgeValue(source, target).get().getDigitMask())); 128 return dfa.successors(source).stream().sorted(ordering).collect(toImmutableSet()); 129 } 130 131 /** Returns the single output target of the given node (or throws an exception). */ singleOutput(DfaNode source)132 private DfaNode singleOutput(DfaNode source) { 133 return Iterables.getOnlyElement(dfa.successors(source)); 134 } 135 136 /** 137 * Builds the output map from a given node in the DFA in the correct order. Note that because 138 * ImmutableSetMultimap.Builder orders keys based on the first time they are added, and we add 139 * keys (nodes) in the order of the input by which they can be reached, the keys of the returned 140 * map are ordered by the lowest digit in their set of values (inputs). This is necessary for 141 * correct behaviour in the "Mapping" operation. 142 */ getOutMap(DfaNode source)143 private ImmutableMap<DfaNode, Integer> getOutMap(DfaNode source) { 144 Function<DfaNode, Integer> getMask = 145 target -> dfa.edgeValue(source, target).get().getDigitMask(); 146 return sortedOutputs(source).stream().collect(toImmutableMap(Function.identity(), getMask)); 147 } 148 149 /** 150 * Recursively builds sequences by traversing the DFA and grouping successive sub-sequences of 151 * nodes which neither branch, nor are branched to. Each such sub-sequence is represented by a 152 * {@code Sequence} instance (a list of non-branching operations, optionally terminated with a 153 * branching operation). 154 */ buildSequencesFrom(DfaNode start, LinkedHashMap<DfaNode, Sequence> map)155 private void buildSequencesFrom(DfaNode start, LinkedHashMap<DfaNode, Sequence> map) { 156 if (map.containsKey(start)) { 157 return; 158 } 159 DfaNode current = start; 160 ImmutableList.Builder<DfaNode> nodes = ImmutableList.builder(); 161 while (true) { 162 nodes.add(current); 163 if (dfa.outDegree(current) != 1) { 164 break; 165 } 166 DfaNode next = singleOutput(current); 167 if (dfa.inDegree(next) > 1) { 168 break; 169 } 170 current = next; 171 } 172 Sequence seq = new Sequence(nodes.build()); 173 map.put(start, seq); 174 // Recurse from the outputs at the end of the sequence according to their edge values. 175 // IMPORTANT: We must not use "current.successors()" here since we need the order of insertion 176 // to be well defined and ValueGraph does not make good enough promises about node ordering. 177 for (DfaNode out : sortedOutputs(current)) { 178 buildSequencesFrom(out, map); 179 } 180 } 181 182 /** Creates and compiles a {@code MatcherBytes} instance to render the output bytes. */ compile(Statistics stats)183 byte[] compile(Statistics stats) { 184 return createMatcherBytes(stats).compile(); 185 } 186 187 /** Creates a mutable {@code MatcherBytes} instance which will render the output bytes. */ createMatcherBytes(Statistics stats)188 MatcherBytes createMatcherBytes(Statistics stats) { 189 return new MatcherBytes(seqStart.values(), stats); 190 } 191 192 /** 193 * A contiguous sub-sequence of nodes in the DFA which neither branch, nor are branched to. 194 * <p> 195 * The important property of a {@code Sequence} is that branching may only occur at the end of a 196 * {@code Sequence} and branches may only jump to the start of another {@code Sequence}. This 197 * makes it easier to separate the compilation of operations (inside sequences) from the 198 * management of branches and offsets (between sequences). 199 */ 200 class Sequence { 201 private final ImmutableList<DfaNode> nodes; 202 Sequence(ImmutableList<DfaNode> nodes)203 Sequence(ImmutableList<DfaNode> nodes) { 204 checkArgument(!nodes.isEmpty()); 205 this.nodes = nodes; 206 } 207 getOp(DfaNode node)208 private Operation getOp(DfaNode node) { 209 return Operation.from(node.canTerminate(), getOutMap(node)); 210 } 211 212 /** 213 * Returns the operations representing this sequence, merging successive operations where 214 * possible. The final list of operations is guaranteed to have at most one branching operation 215 * which (if present) will always be the last element in the list. 216 */ createOps()217 List<Operation> createOps() { 218 List<Operation> ops = new ArrayList<>(); 219 Operation current = getOp(nodes.get(0)); 220 for (int n = 1; n < nodes.size(); n++) { 221 Operation next = getOp(nodes.get(n)); 222 Operation merged = current.mergeWith(next); 223 if (merged != null) { 224 current = merged; 225 } else { 226 ops.add(current); 227 current = next; 228 } 229 } 230 ops.add(current); 231 return ops; 232 } 233 getInitialState()234 DfaNode getInitialState() { 235 return Iterables.get(nodes, 0); 236 } 237 getFinalState()238 DfaNode getFinalState() { 239 return Iterables.getLast(nodes); 240 } 241 getOutStates()242 Set<DfaNode> getOutStates() { 243 return sortedOutputs(getFinalState()); 244 } 245 246 /** 247 * Not the same as "terminating" for an operation. A sequence is "final" if no other sequences 248 * follow it. Normally there is only one final sequence in a normalized DFA, even if that 249 * sequence contains only a single terminating node. However not all terminating nodes are 250 * in final sequences. 251 */ isFinal()252 boolean isFinal() { 253 return getOutStates().isEmpty(); 254 } 255 256 /** Returns the number of nodes that this sequence represents. */ size()257 int size() { 258 return nodes.size(); 259 } 260 unorderedOutSequences()261 ImmutableSet<Sequence> unorderedOutSequences() { 262 return getOutStates().stream().map(seqStart::get).collect(toImmutableSet()); 263 } 264 265 @Override toString()266 public String toString() { 267 return toString(new StringBuilder(), 0).toString(); 268 } 269 toString(StringBuilder buf, int indent)270 private StringBuilder toString(StringBuilder buf, int indent) { 271 List<Operation> ops = createOps(); 272 appendIndent(buf, indent).append( 273 String.format("{%s} %s", nodes.get(0), Joiner.on(" >> ").join(ops))); 274 ImmutableList<DfaNode> outs = Iterables.getLast(ops).getOuts(); 275 if (!outs.isEmpty()) { 276 buf.append(" {\n"); 277 for (DfaNode out : outs) { 278 seqStart.get(out).toString(buf, indent + 1); 279 } 280 appendIndent(buf, indent).append("}\n"); 281 } else { 282 buf.append('\n'); 283 } 284 return buf; 285 } 286 } 287 288 @Override toString()289 public String toString() { 290 return seqStart.get(init).toString(); 291 } 292 appendIndent(StringBuilder out, int indent)293 private static StringBuilder appendIndent(StringBuilder out, int indent) { 294 for (int n = 0; n < indent; n++) { 295 out.append(" "); 296 } 297 return out; 298 } 299 } 300