• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Libphonenumber Authors.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler;
18 
19 import static com.google.common.base.Preconditions.checkArgument;
20 import static com.google.common.collect.ImmutableMap.toImmutableMap;
21 import static com.google.common.collect.ImmutableSet.toImmutableSet;
22 import static java.lang.Integer.numberOfTrailingZeros;
23 
24 import com.google.common.base.Joiner;
25 import com.google.common.base.Preconditions;
26 import com.google.common.collect.ImmutableList;
27 import com.google.common.collect.ImmutableMap;
28 import com.google.common.collect.ImmutableSet;
29 import com.google.common.collect.Iterables;
30 import com.google.common.graph.MutableValueGraph;
31 import com.google.common.graph.ValueGraph;
32 import com.google.common.graph.ValueGraphBuilder;
33 import com.google.i18n.phonenumbers.metadata.RangeTree;
34 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge;
35 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode;
36 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor;
37 import java.util.ArrayList;
38 import java.util.Comparator;
39 import java.util.LinkedHashMap;
40 import java.util.List;
41 import java.util.Set;
42 import java.util.function.Function;
43 
44 /**
45  * Compiles non-capturing phone number regular expressions into sequences of bytes suitable for
46  * creating {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher
47  * DigitSequenceMatcher} instances.
48  */
49 public final class MatcherCompiler {
50   /**
51    * Compiles the given {@code RangeTree} into a sequence of bytes suitable for creating a
52    * {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher
53    * DigitSequenceMatcher}.
54    */
compile(RangeTree dfa)55   public static byte[] compile(RangeTree dfa) {
56     return compile(dfa, Statistics.NO_OP);
57   }
58 
59   /**
60    * As {@link #compile(RangeTree)} but additionally accepts a {@link Statistics} instance
61    * to record metrics about the compilation.
62    */
compile(RangeTree dfa, Statistics stats)63   public static byte[] compile(RangeTree dfa, Statistics stats) {
64     return new MatcherCompiler(dfa).compile(stats);
65   }
66 
67   /** The DFA from which the matcher data is to be compiled. */
68   private final ValueGraph<DfaNode, DfaEdge> dfa;
69   /** The unique initial node of the DFA. */
70   private final DfaNode init;
71   /**
72    * A map from nodes which are at the beginning of a sequence to that sequence. Not all nodes
73    * will be present in the key set of this map.
74    */
75   private final ImmutableMap<DfaNode, Sequence> seqStart;
76 
77   /**
78    * Builds a graph directly from the DFA in a RangeTree.
79    *
80    * <p>Rather than deal with the DFA tree directly (which is deliberately opaque as a data
81    * structure) we serialize it into a more maleable ValueGraph. This allows simpler graph
82    * traversal while maintaining a simple-as-possible node/edge structure. It's okay to reuse the
83    * RangeTree types {@code DfaNode} and {@code DfaEdge} here because they have the expected
84    * semantics (e.g. conforming to equals/hashcode etc...) but care must be taken not to keep the
85    * instances around for a long time, since this will keep larger parts of the original DFA alive
86    * in the garbage collector (but this is fine since only bytes are returned from this class).
87    */
buildGraph(RangeTree dfa)88   private static ValueGraph<DfaNode, DfaEdge> buildGraph(RangeTree dfa) {
89     Preconditions.checkArgument(!dfa.isEmpty());
90     MutableValueGraph<DfaNode, DfaEdge> graph =
91         ValueGraphBuilder.directed().allowsSelfLoops(false).build();
92     graph.addNode(dfa.getInitial());
93     DfaVisitor visitor = new DfaVisitor() {
94       @Override
95       public void visit(DfaNode source, DfaEdge edge, DfaNode target) {
96         boolean isFirstVisit = graph.addNode(target);
97         graph.putEdgeValue(source, target, edge);
98         if (isFirstVisit) {
99           target.accept(this);
100         }
101       }
102     };
103     dfa.accept(visitor);
104     return graph;
105   }
106 
107   /**
108    * Creates a {@code MatcherCompiler} from the given automaton by generating all the
109    * {@code Sequence}'s of operations necessary to represent it.
110    */
MatcherCompiler(RangeTree ranges)111   MatcherCompiler(RangeTree ranges) {
112     this.dfa = buildGraph(ranges);
113     this.init = ranges.getInitial();
114     LinkedHashMap<DfaNode, Sequence> start = new LinkedHashMap<>();
115     buildSequencesFrom(init, start);
116     this.seqStart = ImmutableMap.copyOf(start);
117   }
118 
119   /**
120    * Returns the output targets of the given node sorted according to the lowest "accepting" digit
121    * on the corresponding edge. This ordering is necessary for stability, but also correctness when
122    * building mapping operations. Apart from special cases (e.g. only one output) this is the only
123    * method which should be used to obtain output nodes.
124    */
sortedOutputs(DfaNode source)125   private ImmutableSet<DfaNode> sortedOutputs(DfaNode source) {
126     Comparator<DfaNode> ordering = Comparator.comparing(
127         target -> numberOfTrailingZeros(dfa.edgeValue(source, target).get().getDigitMask()));
128     return dfa.successors(source).stream().sorted(ordering).collect(toImmutableSet());
129   }
130 
131   /** Returns the single output target of the given node (or throws an exception). */
singleOutput(DfaNode source)132   private DfaNode singleOutput(DfaNode source) {
133     return Iterables.getOnlyElement(dfa.successors(source));
134   }
135 
136   /**
137    * Builds the output map from a given node in the DFA in the correct order. Note that because
138    * ImmutableSetMultimap.Builder orders keys based on the first time they are added, and we add
139    * keys (nodes) in the order of the input by which they can be reached, the keys of the returned
140    * map are ordered by the lowest digit in their set of values (inputs). This is necessary for
141    * correct behaviour in the "Mapping" operation.
142    */
getOutMap(DfaNode source)143   private ImmutableMap<DfaNode, Integer> getOutMap(DfaNode source) {
144     Function<DfaNode, Integer> getMask =
145         target -> dfa.edgeValue(source, target).get().getDigitMask();
146     return sortedOutputs(source).stream().collect(toImmutableMap(Function.identity(), getMask));
147   }
148 
149   /**
150    * Recursively builds sequences by traversing the DFA and grouping successive sub-sequences of
151    * nodes which neither branch, nor are branched to. Each such sub-sequence is represented by a
152    * {@code Sequence} instance (a list of non-branching operations, optionally terminated with a
153    * branching operation).
154    */
buildSequencesFrom(DfaNode start, LinkedHashMap<DfaNode, Sequence> map)155   private void buildSequencesFrom(DfaNode start, LinkedHashMap<DfaNode, Sequence> map) {
156     if (map.containsKey(start)) {
157       return;
158     }
159     DfaNode current = start;
160     ImmutableList.Builder<DfaNode> nodes = ImmutableList.builder();
161     while (true) {
162       nodes.add(current);
163       if (dfa.outDegree(current) != 1) {
164         break;
165       }
166       DfaNode next = singleOutput(current);
167       if (dfa.inDegree(next) > 1) {
168         break;
169       }
170       current = next;
171     }
172     Sequence seq = new Sequence(nodes.build());
173     map.put(start, seq);
174     // Recurse from the outputs at the end of the sequence according to their edge values.
175     // IMPORTANT: We must not use "current.successors()" here since we need the order of insertion
176     // to be well defined and ValueGraph does not make good enough promises about node ordering.
177     for (DfaNode out : sortedOutputs(current)) {
178       buildSequencesFrom(out, map);
179     }
180   }
181 
182   /** Creates and compiles a {@code MatcherBytes} instance to render the output bytes. */
compile(Statistics stats)183   byte[] compile(Statistics stats) {
184     return createMatcherBytes(stats).compile();
185   }
186 
187   /** Creates a mutable {@code MatcherBytes} instance which will render the output bytes. */
createMatcherBytes(Statistics stats)188   MatcherBytes createMatcherBytes(Statistics stats) {
189     return new MatcherBytes(seqStart.values(), stats);
190   }
191 
192   /**
193    * A contiguous sub-sequence of nodes in the DFA which neither branch, nor are branched to.
194    * <p>
195    * The important property of a {@code Sequence} is that branching may only occur at the end of a
196    * {@code Sequence} and branches may only jump to the start of another {@code Sequence}. This
197    * makes it easier to separate the compilation of operations (inside sequences) from the
198    * management of branches and offsets (between sequences).
199    */
200   class Sequence {
201     private final ImmutableList<DfaNode> nodes;
202 
Sequence(ImmutableList<DfaNode> nodes)203     Sequence(ImmutableList<DfaNode> nodes) {
204       checkArgument(!nodes.isEmpty());
205       this.nodes = nodes;
206     }
207 
getOp(DfaNode node)208     private Operation getOp(DfaNode node) {
209       return Operation.from(node.canTerminate(), getOutMap(node));
210     }
211 
212     /**
213      * Returns the operations representing this sequence, merging successive operations where
214      * possible. The final list of operations is guaranteed to have at most one branching operation
215      * which (if present) will always be the last element in the list.
216      */
createOps()217     List<Operation> createOps() {
218       List<Operation> ops = new ArrayList<>();
219       Operation current = getOp(nodes.get(0));
220       for (int n = 1; n < nodes.size(); n++) {
221         Operation next = getOp(nodes.get(n));
222         Operation merged = current.mergeWith(next);
223         if (merged != null) {
224           current = merged;
225         } else {
226           ops.add(current);
227           current = next;
228         }
229       }
230       ops.add(current);
231       return ops;
232     }
233 
getInitialState()234     DfaNode getInitialState() {
235       return Iterables.get(nodes, 0);
236     }
237 
getFinalState()238     DfaNode getFinalState() {
239       return Iterables.getLast(nodes);
240     }
241 
getOutStates()242     Set<DfaNode> getOutStates() {
243       return sortedOutputs(getFinalState());
244     }
245 
246     /**
247      * Not the same as "terminating" for an operation. A sequence is "final" if no other sequences
248      * follow it. Normally there is only one final sequence in a normalized DFA, even if that
249      * sequence contains only a single terminating node. However not all terminating nodes are
250      * in final sequences.
251      */
isFinal()252     boolean isFinal() {
253       return getOutStates().isEmpty();
254     }
255 
256     /** Returns the number of nodes that this sequence represents. */
size()257     int size() {
258       return nodes.size();
259     }
260 
unorderedOutSequences()261     ImmutableSet<Sequence> unorderedOutSequences() {
262       return getOutStates().stream().map(seqStart::get).collect(toImmutableSet());
263     }
264 
265     @Override
toString()266     public String toString() {
267       return toString(new StringBuilder(), 0).toString();
268     }
269 
toString(StringBuilder buf, int indent)270     private StringBuilder toString(StringBuilder buf, int indent) {
271       List<Operation> ops = createOps();
272       appendIndent(buf, indent).append(
273           String.format("{%s} %s", nodes.get(0), Joiner.on(" >> ").join(ops)));
274       ImmutableList<DfaNode> outs = Iterables.getLast(ops).getOuts();
275       if (!outs.isEmpty()) {
276         buf.append(" {\n");
277         for (DfaNode out : outs) {
278           seqStart.get(out).toString(buf, indent + 1);
279         }
280         appendIndent(buf, indent).append("}\n");
281       } else {
282         buf.append('\n');
283       }
284       return buf;
285     }
286   }
287 
288   @Override
toString()289   public String toString() {
290     return seqStart.get(init).toString();
291   }
292 
appendIndent(StringBuilder out, int indent)293   private static StringBuilder appendIndent(StringBuilder out, int indent) {
294     for (int n = 0; n < indent; n++) {
295       out.append("  ");
296     }
297     return out;
298   }
299 }
300