• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Libphonenumber Authors.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package com.google.i18n.phonenumbers.metadata;
17 
18 import static com.google.common.base.Preconditions.checkNotNull;
19 import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.REQUIRE_EQUAL_EDGES;
20 
21 import com.google.common.collect.ImmutableList;
22 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge;
23 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode;
24 import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor;
25 import java.util.ArrayList;
26 import java.util.List;
27 
28 /**
29  * Factor a range tree into a sequence of trees which attempts to minimize overall complexity in
30  * the face of non-determinism. This can be used to reduce the size of any generated regular
31  * expressions.
32  */
33 public final class RangeTreeFactorizer {
34   /** Strategies to control how merging is achieved when building factors.*/
35   public enum MergeStrategy {
36     /**
37      * Edges are only merged if they accept exactly the same set of digits. If the existing factor
38      * contains "[0-5]" it will not be merged with the candidate edge "[0-8]".
39      */
40     REQUIRE_EQUAL_EDGES,
41     /**
42      * Edges can be merged if the candidate edge accepts more digits than the existing edge. If the
43      * existing factor contains "[0-5]" and the candidate edge is "[0-8]", the candidate edge is
44      * split so that "[0-5]" is merged as normal and an additional edge "[6-8]" is branched off.
45      */
46     ALLOW_EDGE_SPLITTING,
47   }
48 
49   /**
50    * Factors the given range tree.
51    * <p>
52    * Paths are processed longest-first, and a path belongs in particular "factor" if it can be
53    * added without "causing a split" in the existing factor. For example, given an existing factor
54    * {@code {"12[3-6]x", "45xx"}}:
55    * <ul>
56    *   <li>The path "12[3-6]" can be added, since it is a prefix of one of the existing paths in
57    *   the DFA.
58    *   <li>The path "13xx" can be added since it forms a new branch in the DFA, which does not
59    *   affect any existing branches ("13..." is disjoint with "12...").
60    *   <li>The path "12[34]" cannot be added since it would "split" the existing path
61    *   "12[3-6]x" in the DFA ("[34]" is a subset of "[3-6]"). "
62    *   <li>Depending on the merge strategy, the path "12[0-6]x" might be added ("[0-6]" is a
63    *   superset of "[3-6]"). See {@link MergeStrategy} for more information.
64    * </ul>
65    */
factor(RangeTree ranges, MergeStrategy strategy)66   public static ImmutableList<RangeTree> factor(RangeTree ranges, MergeStrategy strategy) {
67     // If only one length on all paths, the DFA is already "factored".
68     if (ranges.getLengths().size() == 1) {
69       return ImmutableList.of(ranges);
70     }
71     List<RangeTree> factors = new ArrayList<>();
72     // Start with the "naive" factors (splitting by length) from longest to shortest.
73     for (int n : ranges.getLengths().descendingSet()) {
74       factors.add(ranges.intersect(RangeTree.from(RangeSpecification.any(n))));
75     }
76     // Now attempt to merge as much of each of the shorter factors as possible into the longer ones.
77     // In each loop we subsume a candidate factor into previous factors, either in whole or in part.
78     int index = 1;
79     while (index < factors.size()) {
80       // Merge (as much as possible) each "naive" factor into earlier factors.
81       RangeTree r = factors.get(index);
82       for (int n = 0; n < index && !r.isEmpty(); n++) {
83         RangeTree merged = new RangeTreeFactorizer(factors.get(n), strategy).mergeFrom(r);
84         factors.set(n, merged);
85         // Calculate the ranges which haven't yet been merged into any earlier factor.
86         r = r.subtract(merged);
87       }
88       if (r.isEmpty()) {
89         // All ranges merged, so remove the original factor (index now references the next factor).
90         factors.remove(index);
91       } else {
92         // We have some un-factorable ranges which are kept to start a new factor.
93         factors.set(index, r);
94         index++;
95       }
96     }
97     return ImmutableList.copyOf(factors);
98   }
99 
100   // This is modified as paths are added.
101   private RangeTree factor;
102   private final MergeStrategy strategy;
103 
RangeTreeFactorizer(RangeTree factor, MergeStrategy strategy)104   RangeTreeFactorizer(RangeTree factor, MergeStrategy strategy) {
105     this.factor = checkNotNull(factor);
106     this.strategy = strategy;
107   }
108 
mergeFrom(RangeTree ranges)109   RangeTree mergeFrom(RangeTree ranges) {
110     recursivelyMerge(ranges.getInitial(), factor.getInitial(), RangeSpecification.empty());
111     return factor;
112   }
113 
recursivelyMerge(DfaNode srcNode, DfaNode dstNode, RangeSpecification path)114   void recursivelyMerge(DfaNode srcNode, DfaNode dstNode, RangeSpecification path) {
115     if (srcNode.canTerminate()) {
116       factor = factor.union(RangeTree.from(path));
117     } else {
118       srcNode.accept(new FactoringVisitor(dstNode, path));
119     }
120   }
121 
122   private final class FactoringVisitor implements DfaVisitor {
123     private final RangeSpecification path;
124     private final DfaNode dstNode;
125 
126     // True if we encountered a situation when an edge we are merging (srcMask) has a partial
127     // overlap with the existing edge (dstMask) (e.g. merging "[0-6]" into "[4-9]"). This is
128     // distinct from the case where the existing edge is a subset of the edge being merged (e.g.
129     // merging "[0-6]" into "[2-4]", where the edge being merged can be split into "[0156]" and
130     // "[2-4]"). In either strategy, a partial overlap will prevent merging.
131     private boolean partialOverlap = false;
132 
133     // Records the union of all edge ranges visited for the current node. This is used to determine
134     // the remaining edges that must be added after visiting the existing factor (especially in the
135     // case of ALLOW_EDGE_SPLITTING).
136     private int allDstMask = 0;
137 
FactoringVisitor(DfaNode dstNode, RangeSpecification path)138     FactoringVisitor(DfaNode dstNode, RangeSpecification path) {
139       this.dstNode = dstNode;
140       this.path = path;
141     }
142 
143     @Override
visit(DfaNode source, DfaEdge srcEdge, DfaNode srcTarget)144     public void visit(DfaNode source, DfaEdge srcEdge, DfaNode srcTarget) {
145       int srcMask = srcEdge.getDigitMask();
146       dstNode.accept((s, dstEdge, dstTarget) -> {
147         int dstMask = dstEdge.getDigitMask();
148         if ((strategy == REQUIRE_EQUAL_EDGES) ? (dstMask == srcMask) : (dstMask & ~srcMask) == 0) {
149           // The set of digits accepted by the edge being merged (mask) is equal-to or a superset
150           // of the digits of the edge in the factor we are merging into. The path is extended by
151           // the destination edge because during recursion we only follow paths already in the
152           // factor.
153           recursivelyMerge(srcTarget, dstTarget, path.extendByMask(dstMask));
154         } else {
155           partialOverlap |= (dstMask & srcMask) != 0;
156         }
157         allDstMask |= dstMask;
158       });
159       if (!partialOverlap) {
160         // Work out the digits that weren't in any of the edges of the factor we were processing
161         // and merge the sub-tree under that edge into the current factor. For REQUIRE_EQUAL_EDGES
162         // the extraMask is always either srcMask or 0 (since the edge was either added in full,
163         // or disjoint with all the existing edges). For ALLOW_EDGE_SPLITTING it's the remaining
164         // range that wasn't merged with any of the existing paths.
165         int extraMask = srcMask & ~allDstMask;
166         if (extraMask != 0) {
167           new MergingVisitor(path).recurse(srcTarget, extraMask);
168         }
169       }
170     }
171   }
172 
173   private final class MergingVisitor implements DfaVisitor {
174     private final RangeSpecification path;
175 
MergingVisitor(RangeSpecification path)176     MergingVisitor(RangeSpecification path) {
177       this.path = checkNotNull(path);
178     }
179 
recurse(DfaNode node, int mask)180     void recurse(DfaNode node, int mask) {
181       RangeSpecification newPath = path.extendByMask(mask);
182       if (node.canTerminate()) {
183         factor = factor.union(RangeTree.from(newPath));
184       } else {
185         node.accept(new MergingVisitor(newPath));
186       }
187     }
188 
189     @Override
visit(DfaNode source, DfaEdge edge, DfaNode target)190     public void visit(DfaNode source, DfaEdge edge, DfaNode target) {
191       recurse(target, edge.getDigitMask());
192     }
193   }
194 }
195