• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Libphonenumber Authors.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 package com.google.i18n.phonenumbers.metadata;
17 
18 import static com.google.common.base.Preconditions.checkArgument;
19 import static com.google.common.base.Preconditions.checkNotNull;
20 import static com.google.common.base.Preconditions.checkState;
21 import static com.google.common.collect.ImmutableList.toImmutableList;
22 import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK;
23 import static java.lang.Integer.numberOfTrailingZeros;
24 
25 import com.google.common.base.Preconditions;
26 import com.google.common.base.Supplier;
27 import com.google.common.base.Suppliers;
28 import com.google.common.collect.ImmutableList;
29 import com.google.common.collect.ImmutableRangeSet;
30 import com.google.common.collect.ImmutableSortedSet;
31 import com.google.common.collect.RangeSet;
32 import java.lang.ref.Reference;
33 import java.lang.ref.ReferenceQueue;
34 import java.lang.ref.WeakReference;
35 import java.util.ArrayList;
36 import java.util.Comparator;
37 import java.util.List;
38 import java.util.Map;
39 import java.util.concurrent.ConcurrentHashMap;
40 import java.util.function.BiFunction;
41 import java.util.function.Function;
42 import java.util.stream.Stream;
43 import javax.annotation.Nullable;
44 
45 /**
46  * Minimal decision tree for matching digit sequences. A range tree represents an arbitrary set of
47  * digit sequences typically grouped as a set of disjoint ranges. A range tree can be thought of
48  * as equivalent to either {@code RangeSet<DigitSequence>} or a canonical
49  * {@code List<RangeSpecification>}. Range trees can have set-like operations performed on them,
50  * such as union and intersection.
51  */
52 public final class RangeTree {
53   /**
54    * Simple API for representing nodes in the DFA during visitation. See also {@link DfaVisitor}
55    * and {@link #accept(DfaVisitor)}.
56    */
57   public interface DfaNode {
58     /** Returns whether this node can terminate. */
canTerminate()59     boolean canTerminate();
60 
61     /** Accepts the given visitor on this node, visiting its immediate outgoing (child) edges. */
accept(DfaVisitor visitor)62     void accept(DfaVisitor visitor);
63 
64     /** Finds the outgoing edge from this node which contains the given digit. */
65     @Nullable
find(int digit)66     DfaEdge find(int digit);
67 
68     /** Returns the list of edges leading out from this node.*/
getEdges()69     List<DfaEdge> getEdges();
70 
71     /**
72      * Returns a bit-mask of possible lengths from this node. Bit-N is set of the sub-tree rooted
73      * at this node can terminate (in any branch) at depth N. A corollary to this is that bit-0 is
74      * set if this node can terminate.
75      */
getLengthMask()76     int getLengthMask();
77   }
78 
79   /**
80    * Simple API for representing edges in the DFA during visitation. See also {@link DfaVisitor}
81    * and {@link #accept(DfaVisitor)}.
82    */
83   public interface DfaEdge {
84     /** Returns a bit-mask of the digits accepted by this edge. */
getDigitMask()85     int getDigitMask();
86 
87     /** Returns the target node of this edge. */
getTarget()88     DfaNode getTarget();
89   }
90 
91   /**
92    * Visitor API for traversing edges in {@code RangeTrees}. When a node accepts a visitor, it
93    * visits only the immediate outgoing edges of that node. If recursive visitation is required, it
94    * is up to the visitor to call {@link DfaNode#accept(DfaVisitor)} during visitation.
95    *
96    * <p>Graph nodes and edges obey {@code Object} equality and this can be used by visitor
97    * implementations to track which nodes have been reached if they only wish to visit each edge
98    * once (e.g. storing visited nodes in a set or map).
99    */
100   public interface DfaVisitor {
101     /**
102      * Visits an edge in the DFA graph of a {@code RangeTree} as the result of a call to
103      * {@link DfaNode#accept(DfaVisitor)} on the source node.
104      */
visit(DfaNode source, DfaEdge edge, DfaNode target)105     void visit(DfaNode source, DfaEdge edge, DfaNode target);
106   }
107 
108   /**
109    * A single node within a range tree. Nodes are really just a specialized implementation of a
110    * node in a deterministic finite state automaton (DFA), and {@link RangeTree} instances are just
111    * wrappers around a single "root" node.
112    * <p>
113    * Nodes have outgoing {@link Edge}s but may optionally allow termination of matching operations
114    * when they are reached (this is the same as in DFAs). Unlike DFAs however, the out-edges of a
115    * node are grouped according to a mask of all digits which can reach the same target node. For
116    * node {@code A}, which can reach a target node {@code B} via digits {@code {1, 2, 3, 7, 9 }},
117    * there is a single edge labeled with the bitmask {@code 0x287} (binary {@code 1010001110}) as
118    * opposed to 5 separate edges, each marked with a single digit.
119    * <p>
120    * This approach more closely matches the data representations in classes like {@link
121    * RangeSpecification} and affords additional efficiencies compared to the {@code JAutomata}
122    * library, which has performance issues when processing large trees.
123    */
124   private static final class Node implements DfaNode {
125     /** The unique "terminal" node which must be the final node in all paths in all RangeTrees. */
126     private static final Node TERMINAL = new Node();
127 
128     /**
129      * The list of edges, ordered by the lowest bit in each mask. The masks for each edge must be
130      * mutually disjoint. Only the terminal node is permitted to have zero outbound edges.
131      */
132     private final ImmutableList<Edge> edges;
133 
134     /**
135      * Derived bit-packed table of digit-number to edge-index. The edge index for digit {@code 'd'}
136      * is stored in 4-bits, starting at bit {@code 4 * d}. This is useful as it makes finding an
137      * outbound edge a constant time operation (instead of having to search through the list of
138      * edges).
139      */
140     private final long jumpTable;
141 
142     /** A cached value of the total number of unique digits sequences matched by this DFA. */
143     private final long matchCount;
144 
145     /**
146      * Mask of all possible lengths of digit sequences this node can match. If bit-N is set then
147      * this node can match at least one input sequence of length N. Note that this includes bit-0,
148      * which is matched if the node itself can terminate. It is even possible to have a range tree
149      * containing only the terminal node which matches only the empty digit sequence (this is
150      * distinct from the "empty" tree which matches no sequences at all).
151      */
152     private final int lengthMask;
153 
154     /** Nodes are used a keys during graph interning so we cache the hashcode. */
155     private final int hashcode;
156 
157     // Only for the terminal node.
Node()158     private Node() {
159       this.edges = ImmutableList.of();
160       this.jumpTable = -1L;
161       // Unlike the empty tree, the terminal node matches one input sequence, the empty sequence.
162       // The empty tree on the other hand doesn't even reference a terminal node, so there is no
163       // possible sequence it can match.
164       this.matchCount = 1L;
165       this.lengthMask = 1;
166       this.hashcode = -1;
167     }
168 
169     // A node is defined entirely from its set of edges and whether it can terminate.
Node(ImmutableList<Edge> edges, boolean canTerminate)170     private Node(ImmutableList<Edge> edges, boolean canTerminate) {
171       checkArgument(!edges.isEmpty());
172       this.edges = edges;
173       // Everything below here is derived information from the edges and termination flag.
174       int lengthMask = 0;
175       // Set all bits in the jump table (so each 4-bit entry is '-1' unless otherwise overwritten).
176       long jumpTable = -1L;
177       int outMask = 0;
178       int lastLowBit = -1;
179       // If we can terminate we get an additional match count for the sequence that reaches us, but
180       // we may get more for longer sequences matched by nodes we link to.
181       long matchCount = canTerminate ? 1 : 0;
182       for (int n = 0; n < edges.size(); n++) {
183         Edge e = edges.get(n);
184         // Make sure edges are disjoint (edges masks are already known to be valid individually).
185         checkArgument((outMask & e.mask) == 0, "edge masks not disjoint: %s", e);
186         outMask |= e.mask;
187         // Make sure edges are ordered as expected.
188         int lowBit = numberOfTrailingZeros(e.mask);
189         checkArgument(lowBit > lastLowBit, "edge masks not ordered: %s", e);
190         lastLowBit = lowBit;
191         // Work out what the match count is based on the counts of everything we link to (the sum
192         // of all the counts of our target nodes weighted by how many times we link to them).
193         matchCount += Integer.bitCount(e.mask) * e.target.matchCount;
194         // Build up a mask of all the lengths that our target node can contain.
195         lengthMask |= e.target.lengthMask;
196         // For each bit in the edge mask, set the 4-bit nibble in the jump table to the edge index.
197         for (int d = 0; d <= 9; d++) {
198           if ((e.mask & (1 << d)) != 0) {
199             // We rely on the jump table entry having all its bits set here (true from above).
200             // n = 1010 (9).
201             // (n ^ 1111) << (4d) == 0000.0101.0000...
202             // xxxx.1111.yyyy... ^ 0000.0101.0000... == xxxx.1010.yyyy
203             jumpTable ^= (n ^ 0xFL) << (4 * d);
204           }
205         }
206       }
207       this.jumpTable = jumpTable;
208       this.matchCount = matchCount;
209       // Our length set is one more than all our targets (including bit zero if we can terminate).
210       this.lengthMask = (lengthMask << 1) | (canTerminate ? 1 : 0);
211       // Caching the hashcode makes interning faster (note that this is not recursive because the
212       // hashcode of an Edge relies on the identity hashcode of the target nodes).
213       this.hashcode = edges.hashCode() ^ Boolean.hashCode(canTerminate);
214     }
215 
216     /**
217      * Returns the target node for the given input digit, or {@code null} if there is no out-edge
218      * for that digit.
219      */
findTarget(int digit)220     Node findTarget(int digit) {
221       checkArgument(0 <= digit && digit <= 9);
222       return targetFromJumpTableIndex((int) (jumpTable >>> (4 * digit)) & 0xF);
223     }
224 
225     /** Helper to get the target node from an edge index (rather than a digit value). */
targetFromJumpTableIndex(int n)226     private Node targetFromJumpTableIndex(int n) {
227       return (n != 0xF) ? edges.get(n).target : null;
228     }
229 
230     @Nullable
231     @Override
find(int digit)232     public DfaEdge find(int digit) {
233       checkArgument(0 <= digit && digit <= 9);
234       int jumpTableIndex = (int) (jumpTable >>> (4 * digit)) & 0xF;
235       return (jumpTableIndex != 0xF) ? edges.get(jumpTableIndex) : null;
236     }
237 
238     @Override
canTerminate()239     public boolean canTerminate() {
240       return (lengthMask & 1) != 0;
241     }
242 
243     @Override
accept(DfaVisitor visitor)244     public void accept(DfaVisitor visitor) {
245       for (Edge e : edges) {
246         visitor.visit(this, e, e.target);
247       }
248     }
249 
250     @Override
getEdges()251     public List<DfaEdge> getEdges() {
252       // NOTE: This DOES NOT make a copy (or any allocations), since ImmutableList is clever
253       // enough to know that a list of <Bar extends Foo> is also a List<Foo> if they are
254       // unmodifiable. It's a clever cast essentially!
255       return ImmutableList.copyOf(edges);
256     }
257 
258     @Override
getLengthMask()259     public int getLengthMask() {
260       return lengthMask;
261     }
262 
263     /**
264      * Returns whether this node is interchangeable with the given instance. Equality of
265      * {@code Node} instances is not "deep" equality and is carefully designed to make constructing
266      * minimal range trees easier. Two nodes are equal if they have the same set of edges and
267      * termination flag; however edges are equal only if they point to exactly the same target
268      * instances. This is carefully designed to make "interning" work efficiently and avoid
269      * unwanted recursion during various operations.
270      */
271     @Override
equals(Object o)272     public boolean equals(Object o) {
273       if (this == o) {
274         return true;
275       }
276       if (!(o instanceof Node)) {
277         return false;
278       }
279       Node tree = (Node) o;
280       return edges.equals(tree.edges) && canTerminate() == tree.canTerminate();
281     }
282 
283     @Override
hashCode()284     public int hashCode() {
285       return hashcode;
286     }
287   }
288 
289   /**
290    * A directed edge to a target {@link Node}. Note that edge equality is based on instance identity
291    * as part of the interning semantics of range trees.
292    */
293   private static final class Edge implements DfaEdge {
294     /** Bit mask of digit values this edge accepts (bit-N set implies this edge accepts digit N). */
295     private final int mask;
296     /** Target node (or node function) this edge points at. */
297     private final Node target;
298 
Edge(int mask, Node target)299     private Edge(int mask, Node target) {
300       checkArgument(mask > 0 && mask <= RangeSpecification.ALL_DIGITS_MASK);
301       this.mask = mask;
302       this.target = checkNotNull(target);
303     }
304 
305     /** Returns a new edge with the same target whose mask is OR'ed with the given value. */
merge(int m)306     Edge merge(int m) {
307       return new Edge(mask | m, target);
308     }
309 
310     @Override
getDigitMask()311     public int getDigitMask() {
312       return mask;
313     }
314 
315     @Override
getTarget()316     public DfaNode getTarget() {
317       return target;
318     }
319 
320     /**
321      * Edges are equal only if the point to exactly the same targets. This is important to avoid
322      * expensive recursive equality checking during common operations.
323      */
324     @Override
325     @SuppressWarnings("ReferenceEquality")
equals(Object o)326     public boolean equals(Object o) {
327       if (!(o instanceof Edge)) {
328         return false;
329       }
330       Edge other = (Edge) o;
331       return mask == other.mask && target == other.target;
332     }
333 
334     @Override
hashCode()335     public int hashCode() {
336       return mask ^ System.identityHashCode(target);
337     }
338 
339     /** The natural string representation of an edge is the set of digits it accepts. */
340     @Override
toString()341     public String toString() {
342       return RangeSpecification.toString(mask);
343     }
344   }
345 
346   /**
347    * Implementation of set-like operations for range trees. As well as having well defined set-like
348    * operations (union, intersection etc...), range trees are also minimal DFAs, ensuring that no
349    * duplication of sub-trees occurs. This class implements set-like operations efficiently using
350    * recursive interning of nodes and always produces minimal results by construction. This
351    * approach is similar to (but not the same as) dynamic programming.
352    * <p>
353    * Note that the terms "interning" and "minimizing" of range trees are related but not the same.
354    * Interning only makes sense in relation to some sequence of operations (and is a pure
355    * implementation detail for efficiency reasons). A minimal sub-tree exists outside of logical
356    * operations and may be the result of a logical operation, or something else. Minimization of
357    * the DFA represented by a range tree is a user concept.
358    * <p>
359    * Currently all logical operations produce minimal DFAs, but a minimal DFA can come from other
360    * sources (the DFA generated for a single range specification is minimal by construction).
361    */
362   public static final class SetOperations {
363     // A weak reference wrapper around a Node allowing it to be garbage collected when not needed.
364     private static final class WeakNodeRef extends WeakReference<Node> {
365       // This MUST be cached since it cannot be calculated once the Node is garbage collected, and
366       // it's required that keys in maps do not change their hashcode.
367       private final int hashCode;
368 
WeakNodeRef(Node referent, ReferenceQueue<? super Node> q)369       public WeakNodeRef(Node referent, ReferenceQueue<? super Node> q) {
370         super(checkNotNull(referent), q);
371         this.hashCode = referent.hashCode();
372       }
373 
374       @Override
hashCode()375       public int hashCode() {
376         return hashCode;
377       }
378 
379       /*
380        * This is very subtle. To avoid multiple "cleared" references becoming equal to each other
381        * (which violates the expectations of a map) we consider other nodes equal only if either:
382        * 1) they are the same instance
383        * 2) they have a non-null referent that's equal() to our referent
384        *
385        * Two distinct cleared references with the same hashcode must not compare as equal.
386        */
387       @Override
equals(Object obj)388       public boolean equals(Object obj) {
389         if (obj == this) {
390           return true;
391         }
392         // Don't worry about checking "instanceof" since this is a private type.
393         Node referent = get();
394         return referent != null && referent.equals(((WeakNodeRef) obj).get());
395       }
396     }
397 
398     /** Minimal API for any logical operation which can be applied to two range trees. */
399     interface LogicalOperation extends BiFunction<Node, Node, Node> { }
400 
401     /**
402      * Implementation of the "union" operation for two sub-trees. The union of two sub-trees is a
403      * sub-tree which matches digit sequences if-and-only-if they are matched by either of the
404      * original sub-trees.
405      */
406     private final class Union implements LogicalOperation {
407       @Override
408       @SuppressWarnings("ReferenceEquality")
apply(Node lhs, Node rhs)409       public Node apply(Node lhs, Node rhs) {
410         // Assert that inputs are always interned.
411         // NOTE: It might be worth doing checks for "TERMINAL" here as well.
412         if (lhs == rhs || rhs == null) {
413           // (A ∪ A) = A and (A ∪ ∅) = A
414           return lhs;
415         } else if (lhs == null) {
416           // (∅ ∪ B) = B
417           return rhs;
418         }
419         return recurse(this, lhs, rhs, lhs.canTerminate() || rhs.canTerminate());
420       }
421     }
422 
423     /**
424      * Implementation of the "intersection" operation for two sub-trees. The intersection of two
425      * sub-trees is a sub-tree which matches digit sequences if-and-only-if they are matched by
426      * both the original sub-trees.
427      */
428     private final class Intersection implements LogicalOperation {
429       @Override
430       @SuppressWarnings("ReferenceEquality")
apply(Node lhs, Node rhs)431       public Node apply(Node lhs, Node rhs) {
432         // Assert that inputs are always interned.
433         // NOTE: It might be worth doing checks for "TERMINAL" here as well.
434         if (lhs == rhs) {
435           // (A ∩ A) = A
436           return lhs;
437         } else if (lhs == null || rhs == null) {
438           // (∅ ∩ X) = ∅ for any X
439           return null;
440         }
441         return recurse(this, lhs, rhs, lhs.canTerminate() && rhs.canTerminate());
442       }
443     }
444 
445     /**
446      * Implementation of the "subtraction" operation for two sub-trees. The subtraction of two
447      * sub-trees {@code A} and {@code B}, is a sub-tree which matches digit sequences if-and-only-if
448      * they are matched by {@code A} but not {@code B}. This is not a symmetrical operation.
449      */
450     private final class Subtraction implements LogicalOperation {
451       @Override
452       @SuppressWarnings("ReferenceEquality")
apply(Node lhs, Node rhs)453       public Node apply(Node lhs, Node rhs) {
454         // Assert that inputs are always interned.
455         // NOTE: It might be worth doing checks for "TERMINAL" here as well.
456         if (lhs == rhs || lhs == null) {
457           // (A ∖ A) = ∅ and (∅ ∖ B) = ∅
458           return null;
459         } else if (rhs == null) {
460           // (A ∖ ∅) = A
461           return lhs;
462         }
463         return recurse(this, lhs, rhs, lhs.canTerminate() && !rhs.canTerminate());
464       }
465     }
466 
467     private final class Filter implements LogicalOperation {
468       // IMPORTANT: The prefix is neither returned, nor tested directly against instances in the
469       // range tree being filtered (other than the singleton TERMINAL node) which means it need
470       // not be interned before calling this function. If this method were ever changed to return
471       // nodes from the prefix tree or test instance equality with (interned) nodes in the range
472       // tree, then the prefix tree must also be interned before this method is called.
473       @Override
474       @SuppressWarnings("ReferenceEquality")
apply(Node prefix, Node range)475       public Node apply(Node prefix, Node range) {
476         // Assert that ranges are always interned (prefixes don't need to be since we never return
477         // nodes in the prefix tree to form part of the filtered range).
478         if (prefix == null || range == null) {
479           return null;
480         }
481         // If we get to the end of the prefix, just return whatever's left in the range.
482         if (prefix == Node.TERMINAL) {
483           return range;
484         }
485         // Still "in" the prefix but we hit the end of the range (e.g. "123".filter("12") == ∅
486         if (range == Node.TERMINAL) {
487           return null;
488         }
489         // Since we only recurse while still "in" the prefix we are never terminating (e.g.
490         // "123".filter({"12", "1234"}) == {"1234"} and does not contain "12").
491         return recurse(this, prefix, range, false);
492       }
493     }
494 
495     // Singleton set operations instance to handle interning of nodes.
496     static final SetOperations INSTANCE = new SetOperations();
497 
498     /**
499      * Weak-referenced interning map. This cannot be a standard Guava {@code Interner} because it
500      * must recursively intern the targets of any nodes to ensure that once a Node is interned, all
501      * the nodes reachable from it are also interned.
502      */
503     private final Map<WeakNodeRef, WeakNodeRef> interningMap = new ConcurrentHashMap<>();
504 
505     /**
506      * Referent queue onto which node references clear by GC will be put. The elements in this
507      * queue should correspond to unused entries in the map which need to be tidied up.
508      */
509     private final ReferenceQueue<Node> tidyUpQueue = new ReferenceQueue<>();
510 
511     private final LogicalOperation unionFn = new Union();
512     private final LogicalOperation intersectionFn = new Intersection();
513     private final LogicalOperation subtractionFn = new Subtraction();
514     private final LogicalOperation retainFromFn = new Filter();
515 
SetOperations()516     private SetOperations() {
517       intern(Node.TERMINAL);
518     }
519 
520     /**
521      * Interns the target of an edge (this does not make the edge itself interned, but it does
522      * allow edges to be efficiently compared via their targets). If the target of the given edge
523      * was already interned then it is just returned.
524      */
525     @SuppressWarnings("ReferenceEquality")
internTarget(Edge edge)526     private Edge internTarget(Edge edge) {
527       Node target = intern(edge.target);
528       return (target == edge.target) ? edge : new Edge(edge.mask, target);
529     }
530 
531     /**
532      * Recursively interns a node and all nodes reachable from it. Note that if the given nodes do
533      * not represent a minimal DFA, then the interning process itself won't necessarily produce a
534      * minimal result. The minimal DFA property of range trees exists by induction and assumes that
535      * all trees are constructed minimally and that logical operations produce minimal trees. Note
536      * that if necessary, the interning operation could ensure minimization but at the cost of some
537      * efficiency (you would use the EDGE_COLLECTOR to squash duplicate edges).
538      */
intern(Node node)539     private Node intern(Node node) {
540       WeakNodeRef ref = new WeakNodeRef(node, tidyUpQueue);
541       WeakNodeRef existingRef = interningMap.get(ref);
542       if (existingRef != null) {
543         // Claim strong reference once into a local variable.
544         Node interned = existingRef.get();
545         if (interned != null) {
546           // Clear "ref" to prevent it going in the tidy-up queue (it wasn't added to the map).
547           ref.clear();
548           return interned;
549         }
550       }
551       // In the vast majority of cases, the edges of the node being interned already reference
552       // interned targets. The returned list contains edges to (recursively) interned nodes.
553       // If the edges we get back are the edges of our node, we just need to add ourselves to the
554       // intern map. If our edges were not interned (and we aren't in the map yet) then we must
555       // make a duplicate node that has only the interned edges before adding it to the map. This
556       // preserves the property that interned nodes only ever connect to other interned nodes.
557       ImmutableList<Edge> edges =
558           node.edges.stream().map(this::internTarget).collect(toImmutableList());
559       if (!node.edges.equals(edges)) {
560         // Clear the original reference before overwriting the node to avoid it being put on the
561         // tidy-up queue (otherwise as soon as "node" is overwritten, GC could enqueue "ref").
562         ref.clear();
563         // Create a new node with interned edges and a corresponding weak reference.
564         node = new Node(edges, node.canTerminate());
565         ref = new WeakNodeRef(node, tidyUpQueue);
566       }
567       // Consider the race condition where another thread added this node in the meantime. We
568       // cannot obtain a strong reference until after the WeakNode is returned from the map, which
569       // means there's always a race condition under which the referenced node will be collected.
570       while (true) {
571         existingRef = interningMap.putIfAbsent(ref, ref);
572         if (existingRef == null) {
573           // Easy case: We succeeded in putting our new reference into the map, so return our node.
574           return node;
575         }
576         // There's still a risk that the reference became null after being found in the map.
577         Node interned = existingRef.get();
578         if (interned != null) {
579           // Clear "ref" to prevent it going in the tidy-up queue (it wasn't added to the map).
580           ref.clear();
581           return interned;
582         }
583         // The reference must have been garbage collected after the weak node was found. This is
584         // very rare but possible, and the only real strategy is to try again. We can't really end
585         // up in a loop here unless we're continuously garbage collecting (i.e. bigger problems).
586         // We can't find the same reference again (it was cleared and is no longer equal-to "ref").
587       }
588     }
589 
590     // Remove all WeakNodeRefs that have been cleared by the garbage collector. This should
591     // precisely account for all weak nodes in the interning map which have been cleared by the
592     // garbage collector (weak nodes that were never added to the map are cleared manually and
593     // should not appear in the tidy-up queue).
tidyUpInterningMap()594     private void tidyUpInterningMap() {
595       Reference<?> ref;
596       while ((ref = tidyUpQueue.poll()) != null) {
597         interningMap.remove(ref);
598       }
599     }
600 
601     /**
602      * Applies the given operation recursively to a pair of interned nodes. The resulting node is
603      * interned and (if the input nodes were both minimal) minimal.
604      */
605     @SuppressWarnings("ReferenceEquality")
recurse(LogicalOperation logicalOp, Node lhs, Node rhs, boolean canTerminate)606     Node recurse(LogicalOperation logicalOp, Node lhs, Node rhs, boolean canTerminate) {
607       // Stage 1: Use the jump tables of target nodes to make a lookup of input edge index and mask.
608       //
609       // Each entry in the 'inputMap' array is a coded integer containing:
610       // [ lhs edge index | rhs edge index | bitmask of edges ]
611       // [  bits 20-24    |  bits 16-20    |  bits 0-10       ]
612       //
613       // Basically the top 16 bits are the indices for the inputs to the logical operation and the
614       // lower 16 bits are the mask of edge indices to which that result will apply. Because the
615       // map is constructed to avoid any duplication of the indices (the 'inputKey') we ensure that
616       // the logical operation is applied to the minimal number of unique inputs (no duplication).
617       long lhsJumpTable = lhs.jumpTable;
618       long rhsJumpTable = rhs.jumpTable;
619       // Note: Could reuse from a  field (no longer thread safe, but that might be fine)
620       int[] inputMap = new int[10];
621       int mapSize = 0;
622       // The digit mask runs from bit-0 (1) to bit-9.
623       for (int digitMask = 1; digitMask <= (1 << 9); digitMask <<= 1) {
624         int inputKey = (int) (((lhsJumpTable & 0xF) << 4) | (rhsJumpTable & 0xF));
625         int n;
626         for (n = 0; n < mapSize; n++) {
627           if ((inputMap[n] >> 16) == inputKey) {
628             // Add this digit to an existing entry in the input map (the inputs are the same).
629             inputMap[n] |= digitMask;
630             break;
631           }
632         }
633         if (n == mapSize) {
634           // Add this digit to a new entry in the input map (and increase the map size).
635           inputMap[n] = (inputKey << 16) | digitMask;
636           mapSize++;
637         }
638         lhsJumpTable >>= 4;
639         rhsJumpTable >>= 4;
640       }
641       // Stage 2: Given a minimal set in inputs, perform the minimal number of logical operations.
642       // Note however that two operations can often return the same value (especially null or the
643       // TERMINAL node) so we have to minimize the results again and merge identical targets and
644       // masks.
645       List<Edge> out = new ArrayList<>();
646       for (int n = 0; n < mapSize; n++) {
647         int mask = inputMap[n];
648         // If lhs and rhs nodes are interned, then every target they reference is interned.
649         // We also assert that any nodes returned by logical operations are also interned.
650         Node node = logicalOp.apply(
651             lhs.targetFromJumpTableIndex((mask >> 20) & 0xF),
652             rhs.targetFromJumpTableIndex((mask >> 16) & 0xF));
653         if (node == null) {
654           continue;
655         }
656         // Mask out the upper bits that are no longer needed.
657         mask &= RangeSpecification.ALL_DIGITS_MASK;
658         // Find if the result of the logical operation matches an existing result in the edge list.
659         int idx;
660         for (idx = 0; idx < out.size(); idx++) {
661           Edge e = out.get(idx);
662           if (e.target == node) {
663             // We matched an existing result, so replace the existing entry with a new edge which
664             // points to the same target but includes the new digits that also share this result.
665             out.set(idx, e.merge(mask));
666             break;
667           }
668         }
669         if (idx == out.size()) {
670           // This is the first time this result was seen, so add it in a new entry.
671           out.add(new Edge(mask, node));
672         }
673       }
674       // Stage 3: Given a minimal list of final edges (and after checking for degenerate cases
675       // (empty or terminating nodes) create and intern a new minimal node for the edges.
676       if (out.isEmpty()) {
677         return canTerminate ? Node.TERMINAL : null;
678       } else {
679         return intern(new Node(ImmutableList.copyOf(out), canTerminate));
680       }
681     }
682 
683     /**
684      * Returns the (minimal) logical union {@code '∪'} of two (minimal) sub-trees rooted at the
685      * given nodes. The sub-trees are interned before recursively applying the "union" function.
686      */
unionImpl(Node lhs, Node rhs)687     private Node unionImpl(Node lhs, Node rhs) {
688       if (lhs == null) {
689         return rhs;
690       } else if (rhs == null) {
691         return lhs;
692       } else {
693         return unionFn.apply(intern(lhs), intern(rhs));
694       }
695     }
696 
697     /**
698      * Returns the (minimal) logical intersection {@code '∩'} of two (minimal) sub-trees rooted at
699      * the given nodes. The sub-trees are interned before recursively applying the "union" function.
700      */
intersectImpl(Node lhs, Node rhs)701     private Node intersectImpl(Node lhs, Node rhs) {
702       if (lhs == null || rhs == null) {
703         return null;
704       } else {
705         return intersectionFn.apply(intern(lhs), intern(rhs));
706       }
707     }
708 
709     /**
710      * Returns the (minimal) logical subtraction {@code '∖'} of two (minimal) sub-trees rooted at
711      * the given nodes. The sub-trees are interned before recursively applying the "union" function.
712      */
subtractImpl(Node lhs, Node rhs)713     private Node subtractImpl(Node lhs, Node rhs) {
714       if (lhs == null) {
715         return null;
716       } else if (rhs == null) {
717         return lhs;
718       } else {
719         return subtractionFn.apply(intern(lhs), intern(rhs));
720       }
721     }
722 
retainFromImpl(Node prefix, Node range)723     private Node retainFromImpl(Node prefix, Node range) {
724       if (prefix == null || range == null) {
725         return null;
726       }
727       // As this operation never returns nodes that were in the prefix tree, or tests if prefix
728       // nodes are the same as instances in the range tree, there's no need to intern it.
729       return retainFromFn.apply(prefix, intern(range));
730     }
731 
732     /** Returns the union of one or more range trees. */
union(RangeTree first, RangeTree... rest)733     private RangeTree union(RangeTree first, RangeTree... rest) {
734       Node node = first.root;
735       for (RangeTree t : rest) {
736         node = unionImpl(node, t.root);
737       }
738       tidyUpInterningMap();
739       return newOrEmptyTree(node);
740     }
741 
742     /**
743      * Returns the union of two prefix trees. For prefix trees {@code p1} and {@code p2}, the union
744      * {@code P = p1.union(p2)} is defined such that:
745      * <pre>{@code
746      *   P.filter(R) = p1.filter(R).union(p2.filter(R))
747      * }</pre>
748      * If prefixes are the same length this is equivalent to {@link RangeTree#union(RangeTree)},
749      * but when prefixes overlap, only the more general (shorter) prefix is retained.
750      */
union(PrefixTree lhs, PrefixTree rhs)751     PrefixTree union(PrefixTree lhs, PrefixTree rhs) {
752       // Using one prefix tree (A) to filter another (B), gives you the set of ranges in B which
753       // are, at least, also contained in A. The union of two prefix trees need only contain the
754       // more general (shorter) prefix and the more specific (longer) prefixes must be removed
755       // since they overlap with the more general ones.
756       //
757       // For example "12".retainFrom("1234") == "1234", but we don't want to retain "1234" in the
758       // final range tree (since it will already contain "12" anyway).
759       //
760       // If the same prefix exists in both inputs however, just doing this subtraction would remove
761       // it from the result (which is not what we want), so we also include the intersection of the
762       // prefix ranges.
763       RangeTree ltree = lhs.asRangeTree();
764       RangeTree rtree = rhs.asRangeTree();
765       return PrefixTree.from(union(
766           // Prefixes in both inputs (which would otherwise be removed by the subtractions below).
767           intersect(ltree, rtree),
768           // Prefixes in "lhs" which are strictly more general than any prefix in "rhs"
769           subtract(ltree, retainFrom(rhs, ltree)),
770           // Prefixes in "rhs" which are strictly more general than any prefix in "lhs"
771           subtract(rtree, retainFrom(lhs, rtree))));
772     }
773 
774     /** Returns the intersection of one or more range trees. */
intersect(RangeTree first, RangeTree... rest)775     private RangeTree intersect(RangeTree first, RangeTree... rest) {
776       Node node = first.root;
777       for (RangeTree t : rest) {
778         node = intersectImpl(node, t.root);
779       }
780       tidyUpInterningMap();
781       return newOrEmptyTree(node);
782     }
783 
784     /**
785      * Returns the intersection of two prefix trees. For prefix trees {@code p1} and {@code p2},
786      * the intersection {@code P = p1.intersect(p2)} is defined such that:
787      * <pre>{@code
788      *   P.filter(R) = p1.filter(R).intersect(p2.filter(R))
789      * }</pre>
790      * If prefixes are the same length this is equivalent to {@link RangeTree#intersect(RangeTree)},
791      * but when prefixes overlap, only the more specific (longer) prefix is retained.
792      */
intersect(PrefixTree lhs, PrefixTree rhs)793     PrefixTree intersect(PrefixTree lhs, PrefixTree rhs) {
794       return PrefixTree.from(union(
795           // Prefixes in "lhs" which are the same or more specific as any prefix in "rhs"
796           retainFrom(rhs, lhs.asRangeTree()),
797           // Prefixes in "rhs" which are the same or more specific as any prefix in "lhs"
798           retainFrom(lhs, rhs.asRangeTree())));
799     }
800 
801     /** Returns the difference of two range trees, {@code lhs - rhs}. */
subtract(RangeTree lhs, RangeTree rhs)802     private RangeTree subtract(RangeTree lhs, RangeTree rhs) {
803       Node node = subtractImpl(lhs.root, rhs.root);
804       tidyUpInterningMap();
805       return newOrEmptyTree(node);
806     }
807 
808     /**
809      * Returns a subset of the given ranges, containing only ranges which are prefixed by an
810      * element in the given prefix tree. For example:
811      * <pre> {@code
812      *   RangeTree r = { "12xx", "1234x" }
813      *   PrefixTree p = { "12[0-5]" }
814      *   retainFrom(p, r) = { "12[0-5]x", "1234x"}
815      * }</pre>
816      * Note that if the prefix tree is empty, this method returns the empty range tree.
817      */
retainFrom(PrefixTree prefixes, RangeTree ranges)818     RangeTree retainFrom(PrefixTree prefixes, RangeTree ranges) {
819       Node node = retainFromImpl(prefixes.asRangeTree().root, ranges.root);
820       tidyUpInterningMap();
821       return newOrEmptyTree(node);
822     }
823   }
824 
825   /**
826    * Returns a minimal range tree for the given specification. The tree has only one path and only
827    * matches digit sequences of the same length as the given specification.
828    */
from(RangeSpecification s)829   public static RangeTree from(RangeSpecification s) {
830     Node node = Node.TERMINAL;
831     for (int n = s.length() - 1; n >= 0; n--) {
832       node = new Node(ImmutableList.of(new Edge(s.getBitmask(n), node)), false);
833     }
834     return newOrEmptyTree(node);
835   }
836 
837   /**
838    * Returns a minimal range tree for the given specifications. This tree is formed as the logical
839    * union of the trees for all given specifications.
840    */
from(Iterable<RangeSpecification> specs)841   public static RangeTree from(Iterable<RangeSpecification> specs) {
842     SetOperations setOps = SetOperations.INSTANCE;
843     Node node = null;
844     for (RangeSpecification s : specs) {
845       node = setOps.unionImpl(node, from(s).root);
846     }
847     setOps.tidyUpInterningMap();
848     return newOrEmptyTree(node);
849   }
850 
851   /**
852    * Returns a minimal range tree for the given specifications. This tree is formed as the logical
853    * union of the trees for all given specifications.
854    */
from(Stream<RangeSpecification> specs)855   public static RangeTree from(Stream<RangeSpecification> specs) {
856     return from(specs.collect(toImmutableList()));
857   }
858 
859   /**
860    * Returns a minimal range tree for the given digit sequence ranges. This tree is formed as the
861    * logical union of all the range specifications derived from the given ranges.
862    */
from(RangeSet<DigitSequence> ranges)863   public static RangeTree from(RangeSet<DigitSequence> ranges) {
864     // Currently we don't accept an empty range set in RangeSpecification.from().
865     return !ranges.isEmpty() ? from(RangeSpecification.from(ranges)) : RangeTree.empty();
866   }
867 
868   /**
869    * Returns a range tree whose root is the given DfaNode. The given node must have been found by
870    * visiting an existing range tree. This method is useful for efficiently implementing "sub tree"
871    * logic in some cases.
872    *
873    * @throws IllegalArgumentException if the given node did not come from a valid range tree.
874    */
875   @SuppressWarnings("ReferenceEquality")
from(DfaNode root)876   public static RangeTree from(DfaNode root) {
877     checkNotNull(root, "root node cannot be null");
878     checkArgument(root instanceof Node,
879         "invalid root node (wrong type='%s'): %s", root.getClass(), root);
880     Node node = (Node) root;
881     // Reference equality is correct since this is testing for interning.
882     checkArgument(node == SetOperations.INSTANCE.intern(node),
883         "invalid root node (not from valid RangeTree): %s", node);
884     return new RangeTree(node);
885   }
886 
887   private static final RangeTree EMPTY = new RangeTree();
888 
889   /** Returns the enpty range tree, which matches only the empty digit sequence. */
empty()890   public static RangeTree empty() {
891     return EMPTY;
892   }
893 
newOrEmptyTree(Node node)894   private static RangeTree newOrEmptyTree(Node node) {
895     return node != null ? new RangeTree(node) : EMPTY;
896   }
897 
898   /**
899    * The root node, possibly null to signify the "empty" tree which matches no possible digit
900    * sequences (this is distinct from a tree that matches only the empty digit sequence).
901    */
902   private final Node root;
903   private final long matchCount;
904   private final Supplier<ImmutableSortedSet<Integer>> lengths;
905   // Cached on demand.
906   private Integer hashCode = null;
907 
908   /** Constructor for the singleton empty tree. */
RangeTree()909   private RangeTree() {
910     this.root = null;
911     // Unlike the terminal node (which matches the empty sequence), the empty tree matches nothing.
912     this.matchCount = 0L;
913     this.lengths = Suppliers.ofInstance(ImmutableSortedSet.of());
914   }
915 
916   /** Constructor for a non-empty tree. */
RangeTree(Node root)917   private RangeTree(Node root) {
918     this.root = Preconditions.checkNotNull(root);
919     this.matchCount = root.matchCount;
920     this.lengths = Suppliers.memoize(() -> calculateLengths(root));
921   }
922 
923   /**
924    * Returns whether this range tree accepts any input sequences. Note that in theory a range tree
925    * could accept the empty digit sequence (but in that case it would not be empty). An empty range
926    * tree cannot match any possible sequence.
927    */
isEmpty()928   public boolean isEmpty() {
929     return root == null;
930   }
931 
calculateLengths(Node root)932   private static ImmutableSortedSet<Integer> calculateLengths(Node root) {
933     // Length mask cannot be 0 as it must match (at least) sequences of length 0.
934     int lengthMask = root.lengthMask;
935     ImmutableSortedSet.Builder<Integer> lengths = ImmutableSortedSet.naturalOrder();
936     do {
937       int length = numberOfTrailingZeros(lengthMask);
938       lengths.add(length);
939       // Clear each bit as we go.
940       lengthMask &= ~(1 << length);
941     } while (lengthMask != 0);
942     return lengths.build();
943   }
944 
945   /** Returns the set of digit sequence lengths which could be matched by this range tree. */
getLengths()946   public ImmutableSortedSet<Integer> getLengths() {
947     return lengths.get();
948   }
949 
950   /**
951    * Returns the smallest digit sequence which will be accepted by this range tree, in
952    * {@link DigitSequence} order. Note that this is not the same as calling {@code sample(0)},
953    * since {@link #sample(long)} does not use {@code DigitSequence} order.
954    *
955    * @return the smallest digit sequence accepted by this tree.
956    * @throws IllegalStateException if the tree is empty.
957    */
first()958   public DigitSequence first() {
959     checkState(!isEmpty(), "cannot get minimum sequence for an empty range tree");
960     DigitSequence first = DigitSequence.empty();
961     Node node = root;
962     int minLength = Integer.numberOfTrailingZeros(root.lengthMask);
963     if (minLength > 0) {
964       // Length mask is the mask for checking against the target node's length(s), so we pre-shift
965       // it by one (i.e. not "1 << minLength"). This is also why there needs to be a zero check
966       // around this loop since otherwise we would not correctly detect when the empty sequence was
967       // in the tree (we don't check the root node in this loop).
968       for (int lengthMask = 1 << (minLength - 1); lengthMask > 0; lengthMask >>>= 1) {
969         for (Edge e : node.edges) {
970           // Exit when we find the first edge for which the minimum length path can be reached.
971           // This is only possible for first() because edges are ordered by their minimum digit
972           // (you could not use a similar trick to implement a last() method). This break must be
973           // reached since at least once edge _must_ have the expected length bit set.
974           if ((e.target.lengthMask & lengthMask) != 0) {
975             first = first.extendBy(DigitSequence.singleton(Integer.numberOfTrailingZeros(e.mask)));
976             node = e.target;
977             break;
978           }
979         }
980       }
981     }
982     return first;
983   }
984 
985   /**
986    * Returns a digit sequence in the range tree for a given sampling index (in the range
987    * {@code 0 <= index < size()}). Note that this method makes no promises about the specific
988    * ordering used since it is dependant on the internal tree structure.
989    *
990    * <p>However the mapping from index to sequence is guaranteed to be a bijection, so while it is
991    * not true that {@code sample(n).next().equals(sample(n+1))}, it is true that
992    * {@code sample(n).equals(sample(m))} if-and-only-if {@code n == m}. Thus a pseudo random sample
993    * of N distinct indices will result in N distinct sequences.
994    *
995    * <p>This method is not recommended for general iteration over a tree, since there can be
996    * trillions of digit sequences.
997    *
998    * @throws ArrayIndexOutOfBoundsException if the index is invalid.
999    */
sample(long index)1000   public DigitSequence sample(long index) {
1001     if (index < 0 || index >= size()) {
1002       throw new IndexOutOfBoundsException(
1003           String.format("index (%d) out of bounds [0...%d]", index, size()));
1004     }
1005     return recursiveGet(root, index);
1006   }
1007 
1008   @SuppressWarnings("ReferenceEquality")  // Nodes are interned.
recursiveGet(Node node, long index)1009   private static DigitSequence recursiveGet(Node node, long index) {
1010     // We can assert that 0 <= index < node.matchCount by inspection (checked initially and true
1011     // by code inspection below).
1012     if (node.canTerminate()) {
1013       // Every recursion should end here (since at some point we traverse the final edge where the
1014       // index has been reduced to zero). However we also get here while still in the tree and must
1015       // decide whether to terminate if we see a terminating node.
1016       if (index == 0) {
1017         return DigitSequence.empty();
1018       }
1019       // Subtract 1 to account for this early terminating digit sequence (it is reflected in the
1020       // match count so we must adjust our index before moving on).
1021       index -= 1;
1022     }
1023     // Should always have at least one out edge here so the mask isn't empty.
1024     checkState(node != Node.TERMINAL, "!!! Bad RangeTree !!!");
1025     for (Edge e : node.edges) {
1026       long weightedCount = e.target.matchCount * Integer.bitCount(e.mask);
1027       if (index >= weightedCount) {
1028         // We are not following this edge, so adjust index and continue.
1029         index -= weightedCount;
1030         continue;
1031       }
1032       // Find which digit of the edge we are traversing. If we are in the Nth copy of the match
1033       // count we want the digit corresponding to the Nth bit in the edge mask. Achieve this by
1034       // repeatedly removing the lowest set bit each time around the loop (getting the lowest set
1035       // bit as a mask is way faster than getting it's bit position).
1036       int mask = e.mask;
1037       while (index >= e.target.matchCount) {
1038         index -= e.target.matchCount;
1039         mask &= ~Integer.lowestOneBit(mask);
1040       }
1041       return DigitSequence.singleton(Integer.numberOfTrailingZeros(mask))
1042           .extendBy(recursiveGet(e.target, index));
1043     }
1044     // Should be impossible since we should always find an edge for the current index. If we get
1045     // here something is very messed up with either this code or the internal data structure.
1046     throw new IllegalStateException("!!! Bad RangeTree !!!");
1047   }
1048 
1049   /** Returns the number of unique digit sequences contained in this range tree. */
size()1050   public long size() {
1051     return matchCount;
1052   }
1053 
1054   // -------- Set-like operations --------
1055 
1056   /** Returns the minimal logical union of this instance and the given tree. */
union(RangeTree tree)1057   public RangeTree union(RangeTree tree) {
1058     return SetOperations.INSTANCE.union(this, tree);
1059   }
1060 
1061   /** Returns the minimal logical intersection of this instance and the given tree. */
intersect(RangeTree tree)1062   public RangeTree intersect(RangeTree tree) {
1063     return SetOperations.INSTANCE.intersect(this, tree);
1064   }
1065 
1066   /** Returns the minimal logical subtraction of the given tree from this instance. */
subtract(RangeTree tree)1067   public RangeTree subtract(RangeTree tree) {
1068     return SetOperations.INSTANCE.subtract(this, tree);
1069   }
1070 
1071   /** Returns whether a given digit sequence is in the set of sequences defined by this tree. */
contains(DigitSequence digits)1072   public boolean contains(DigitSequence digits) {
1073     Node node = root;
1074     if (node == null) {
1075       return false;
1076     }
1077     for (int n = 0; n < digits.length(); n++) {
1078       node = node.findTarget(digits.getDigit(n));
1079       if (node == null) {
1080         return false;
1081       }
1082     }
1083     return node.canTerminate();
1084   }
1085 
1086   /**
1087    * Returns true if the given tree is a subset of this instance. This is functionally equivalent
1088    * to {@code tree.subtract(this).isEmpty()}, but much more efficient in cases where it returns
1089    * false.
1090    */
containsAll(RangeTree tree)1091   public boolean containsAll(RangeTree tree) {
1092     if (tree.isEmpty()) {
1093       // Everything contains all the contents of the empty set (even the empty set).
1094       return true;
1095     }
1096     if (isEmpty()) {
1097       // Nothing is contained by the empty set.
1098       return false;
1099     }
1100     ContainsAllVisitor v = new ContainsAllVisitor(getInitial());
1101     tree.getInitial().accept(v);
1102     return v.containsAll;
1103   }
1104 
1105   // A very efficient test of tree containment (faster than doing "b.subtract(a).isEmpty()").
1106   private static final class ContainsAllVisitor implements DfaVisitor {
1107     private boolean containsAll = true;
1108     private DfaNode current;
ContainsAllVisitor(DfaNode node)1109     private ContainsAllVisitor(DfaNode node) {
1110       current = node;
1111     }
1112 
1113     @SuppressWarnings("ReferenceEquality")
1114     @Override
visit(DfaNode source, DfaEdge edge, DfaNode target)1115     public void visit(DfaNode source, DfaEdge edge, DfaNode target) {
1116       // Since nodes are interned once in a tree, '==' is sufficient and not potentially slow.
1117       if (current == source || !containsAll) {
1118         // An identical subtree means we can shortcut everything (also if we know we've failed).
1119         return;
1120       }
1121       // No containment if the "subset" tree has lengths not in the current tree.
1122       // This also effectively checks for termination at this node (via bit-0).
1123       if ((~current.getLengthMask() & source.getLengthMask()) != 0) {
1124         containsAll = false;
1125         return;
1126       }
1127       // Recursively check that the sub-tree of the target node is contained within one or more
1128       // edges of the current tree.
1129       int subMask = edge.getDigitMask();
1130       for (DfaEdge e : current.getEdges()) {
1131         // Look at paths which are in both trees.
1132         int m = (e.getDigitMask() & subMask);
1133         if (m != 0) {
1134           DfaNode oldCurrent = current;
1135           current = e.getTarget();
1136           edge.getTarget().accept(this);
1137           current = oldCurrent;
1138           if (!containsAll) {
1139             // Containment failure in some sub-tree.
1140             return;
1141           }
1142           // Clear bits we're accounted for.
1143           subMask &= ~m;
1144         }
1145       }
1146       // If not all edges were accounted for, this was not a valid sub-tree.
1147       if (subMask != 0) {
1148         containsAll = false;
1149       }
1150     }
1151   }
1152 
1153   // -------- Non set-like operations (transforming a RangeTree in non set-like ways) --------
1154 
1155   /** A general mapping function for transforming a range tree via its specifications. */
map(Function<RangeSpecification, RangeSpecification> fn)1156   public RangeTree map(Function<RangeSpecification, RangeSpecification> fn) {
1157     return from(asRangeSpecifications().stream().map(fn));
1158   }
1159 
1160   /**
1161    * Returns a range tree which matches the same digit sequences as this instance down to the first
1162    * {@code n} digits. The returned tree is a super-set of this instance.
1163    */
significantDigits(int n)1164   public RangeTree significantDigits(int n) {
1165     checkArgument(n >= 0, "invalid significant digits");
1166     return map(s -> s.first(n).extendByLength(Math.max(s.length() - n, 0)));
1167   }
1168 
1169   /** Returns a range tree with the given "path" prefixed to the front. */
prefixWith(RangeSpecification prefix)1170   public RangeTree prefixWith(RangeSpecification prefix) {
1171     checkArgument(isEmpty() || getLengths().last() + prefix.length() <= DigitSequence.MAX_DIGITS,
1172         "cannot extend range tree (prefix '%s' too long): %s", prefix, this);
1173     return (prefix.length() > 0) ? map(prefix::extendBy) : this;
1174   }
1175 
1176   /**
1177    * Slices a range tree at a single length. This is equivalent to {@code slice(length, length)}.
1178    */
slice(int length)1179   public RangeTree slice(int length) {
1180     return slice(length, length);
1181   }
1182 
1183   /**
1184    * Slices a range tree within the specified length bounds. A path exists in the returned tree if
1185    * it's length is in the (inclusive) range {@code [minLength, maxLength]} or it was longer than
1186    * {@code maxLength} but has been truncated. Importantly the returned range tree is not a subset
1187    * of the original tree.
1188    *
1189    * <p>This method can be thought of as returning the "complete or partially complete digit
1190    * sequences up to the specified maximum length". It is useful for calculating prefixes which
1191    * match partial digit sequences (i.e. "as you type" formatting).
1192    *
1193    * <p>For example:
1194    * <pre> {@code
1195    * slice({ 12345, 67xxx, 89 }, 0, 3) == { 123, 67x, 89 }
1196    * slice({ 12345, 67xxx, 89 }, 3, 3) == { 123, 67x }
1197    * slice({ 12, 34, 5 }, 2, 3) == { 12, 34 }
1198    * slice({ 12, 34 }, 3, 3) == { }
1199    * }</pre>
1200    */
slice(int minLength, int maxLength)1201   public RangeTree slice(int minLength, int maxLength) {
1202     return from(
1203         asRangeSpecifications().stream()
1204             .filter(s -> s.length() >= minLength)
1205             .map(s -> s.first(maxLength)));
1206   }
1207 
1208   // -------- Transformation APIs (converting a RangeTree to another representation) --------
1209 
1210   /** Returns the minimal, ordered list of range specifications represented by this tree. */
asRangeSpecifications()1211   public ImmutableList<RangeSpecification> asRangeSpecifications() {
1212     if (root == null) {
1213       return ImmutableList.of();
1214     }
1215     List<RangeSpecification> out = new ArrayList<>();
1216     int lenMask = root.lengthMask;
1217 
1218     if ((lenMask & (lenMask - 1)) == 0) {
1219       // If this tree only matches one length of sequences, we can just serialize it directly.
1220       addSpecs(this.root, RangeSpecification.empty(), out);
1221     } else {
1222       // When a tree matches more than one length, we cannot just serialize it in one go, because
1223       // the tree for ["123", "####"] would serialize as:
1224       //   ["123", "[02-9]###", "1[013-9]##", "12[0-24-9]#", "123#"]
1225       // and while the union of those 4-digit sequences is the same as "####", it's hardly minimal
1226       // or user friendly. In order to get a minimal serialization for a given length N, it is
1227       // sufficient to intersect the tree with the "allMatch" tree of length N (e.g. "####...")
1228       // and then serialize the result.
1229       SetOperations setOps = SetOperations.INSTANCE;
1230       for (Integer length : getLengths()) {
1231         // This can't be empty because we know there's at least one branch that matches digits
1232         // of the current length (or we would not have returned the length from getLengths()).
1233         addSpecs(
1234             setOps.intersectImpl(this.root, allMatch(length)), RangeSpecification.empty(), out);
1235       }
1236       setOps.tidyUpInterningMap();
1237     }
1238     return ImmutableList.sortedCopyOf(Comparator.comparing(RangeSpecification::min), out);
1239   }
1240 
1241   /**
1242    * Recursively adds the range specifications generated from the given sub-tree to the output
1243    * list.
1244    */
addSpecs(Node node, RangeSpecification spec, List<RangeSpecification> out)1245   private static void addSpecs(Node node, RangeSpecification spec, List<RangeSpecification> out) {
1246     if (node.canTerminate()) {
1247       out.add(spec);
1248     }
1249     for (Edge e : node.edges) {
1250       addSpecs(e.target, spec.extendByMask(e.mask), out);
1251     }
1252   }
1253 
1254   /** Returns a node which accepts any digit sequences of the given length. */
allMatch(int length)1255   private static Node allMatch(int length) {
1256     Node node = Node.TERMINAL;
1257     for (int n = 0; n < length; n++) {
1258       node = new Node(ImmutableList.of(new Edge(ALL_DIGITS_MASK, node)), false);
1259     }
1260     return node;
1261   }
1262 
1263   /** Returns the minimal covering of ranges specified by this tree. */
asRangeSet()1264   public ImmutableRangeSet<DigitSequence> asRangeSet() {
1265     ImmutableRangeSet.Builder<DigitSequence> out = ImmutableRangeSet.builder();
1266     // Not all ranges create different range specifications are disjoint and this will merge then
1267     // into then minimal set.
1268     for (RangeSpecification s : asRangeSpecifications()) {
1269       out.addAll(s.asRanges());
1270     }
1271     return out.build();
1272   }
1273 
1274   // -------- DFA visitor API --------
1275 
1276   /**
1277    * Accepts the given visitor on the root of this non-empty tree.
1278    *
1279    * @throws IllegalStateException if the tree is empty.
1280    */
accept(DfaVisitor visitor)1281   public void accept(DfaVisitor visitor) {
1282     checkState(root != null, "cannot accept a visitor on an empty range tree");
1283     root.accept(visitor);
1284   }
1285 
1286   /**
1287    * Returns the initial node of this non-empty tree.
1288    *
1289    * @throws IllegalStateException if the tree is empty.
1290    */
getInitial()1291   public DfaNode getInitial() {
1292     checkState(root != null, "cannot get the initial node from an empty range tree");
1293     return root;
1294   }
1295 
1296   /** Returns the singleton terminal node of any range tree. */
getTerminal()1297   public static DfaNode getTerminal() {
1298     return Node.TERMINAL;
1299   }
1300 
1301   // -------- Miscellaneous Object APIs --------
1302 
1303   @SuppressWarnings("ReferenceEquality")
1304   @Override
equals(Object o)1305   public boolean equals(Object o) {
1306     // This could also just convert both to range specifications and use that, but that's likely
1307     // a lot more work (not that this is trivial). If you really want fast equality of trees then
1308     // the interning map needs to be global (but also switched to a weak map).
1309     if (this == o) {
1310       return true;
1311     }
1312     if (!(o instanceof RangeTree)) {
1313       return false;
1314     }
1315     RangeTree other = (RangeTree) o;
1316     if (root == null && other.root == null) {
1317       // Empty trees are equal.
1318       return true;
1319     }
1320     if (root == null || other.root == null) {
1321       // An empty tree is never equal to a non-empty tree.
1322       return false;
1323     }
1324     // Intern both trees and see if their roots are now identical.
1325     SetOperations setOps = SetOperations.INSTANCE;
1326     return setOps.intern(root) == setOps.intern(((RangeTree) o).root);
1327   }
1328 
1329   @Override
hashCode()1330   public int hashCode() {
1331     if (hashCode == null) {
1332       hashCode = asRangeSpecifications().hashCode();
1333     }
1334     return hashCode;
1335   }
1336 
1337   /** Debugging only. */
1338   @Override
toString()1339   public String toString() {
1340     return asRangeSpecifications().toString();
1341   }
1342 }
1343