1 /* 2 * Copyright (C) 2017 The Libphonenumber Authors. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.google.i18n.phonenumbers.metadata; 17 18 import static com.google.common.base.Preconditions.checkArgument; 19 import static com.google.common.base.Preconditions.checkNotNull; 20 import static com.google.common.base.Preconditions.checkState; 21 import static com.google.common.collect.ImmutableList.toImmutableList; 22 import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; 23 import static java.lang.Integer.numberOfTrailingZeros; 24 25 import com.google.common.base.Preconditions; 26 import com.google.common.base.Supplier; 27 import com.google.common.base.Suppliers; 28 import com.google.common.collect.ImmutableList; 29 import com.google.common.collect.ImmutableRangeSet; 30 import com.google.common.collect.ImmutableSortedSet; 31 import com.google.common.collect.RangeSet; 32 import java.lang.ref.Reference; 33 import java.lang.ref.ReferenceQueue; 34 import java.lang.ref.WeakReference; 35 import java.util.ArrayList; 36 import java.util.Comparator; 37 import java.util.List; 38 import java.util.Map; 39 import java.util.concurrent.ConcurrentHashMap; 40 import java.util.function.BiFunction; 41 import java.util.function.Function; 42 import java.util.stream.Stream; 43 import javax.annotation.Nullable; 44 45 /** 46 * Minimal decision tree for matching digit sequences. A range tree represents an arbitrary set of 47 * digit sequences typically grouped as a set of disjoint ranges. A range tree can be thought of 48 * as equivalent to either {@code RangeSet<DigitSequence>} or a canonical 49 * {@code List<RangeSpecification>}. Range trees can have set-like operations performed on them, 50 * such as union and intersection. 51 */ 52 public final class RangeTree { 53 /** 54 * Simple API for representing nodes in the DFA during visitation. See also {@link DfaVisitor} 55 * and {@link #accept(DfaVisitor)}. 56 */ 57 public interface DfaNode { 58 /** Returns whether this node can terminate. */ canTerminate()59 boolean canTerminate(); 60 61 /** Accepts the given visitor on this node, visiting its immediate outgoing (child) edges. */ accept(DfaVisitor visitor)62 void accept(DfaVisitor visitor); 63 64 /** Finds the outgoing edge from this node which contains the given digit. */ 65 @Nullable find(int digit)66 DfaEdge find(int digit); 67 68 /** Returns the list of edges leading out from this node.*/ getEdges()69 List<DfaEdge> getEdges(); 70 71 /** 72 * Returns a bit-mask of possible lengths from this node. Bit-N is set of the sub-tree rooted 73 * at this node can terminate (in any branch) at depth N. A corollary to this is that bit-0 is 74 * set if this node can terminate. 75 */ getLengthMask()76 int getLengthMask(); 77 } 78 79 /** 80 * Simple API for representing edges in the DFA during visitation. See also {@link DfaVisitor} 81 * and {@link #accept(DfaVisitor)}. 82 */ 83 public interface DfaEdge { 84 /** Returns a bit-mask of the digits accepted by this edge. */ getDigitMask()85 int getDigitMask(); 86 87 /** Returns the target node of this edge. */ getTarget()88 DfaNode getTarget(); 89 } 90 91 /** 92 * Visitor API for traversing edges in {@code RangeTrees}. When a node accepts a visitor, it 93 * visits only the immediate outgoing edges of that node. If recursive visitation is required, it 94 * is up to the visitor to call {@link DfaNode#accept(DfaVisitor)} during visitation. 95 * 96 * <p>Graph nodes and edges obey {@code Object} equality and this can be used by visitor 97 * implementations to track which nodes have been reached if they only wish to visit each edge 98 * once (e.g. storing visited nodes in a set or map). 99 */ 100 public interface DfaVisitor { 101 /** 102 * Visits an edge in the DFA graph of a {@code RangeTree} as the result of a call to 103 * {@link DfaNode#accept(DfaVisitor)} on the source node. 104 */ visit(DfaNode source, DfaEdge edge, DfaNode target)105 void visit(DfaNode source, DfaEdge edge, DfaNode target); 106 } 107 108 /** 109 * A single node within a range tree. Nodes are really just a specialized implementation of a 110 * node in a deterministic finite state automaton (DFA), and {@link RangeTree} instances are just 111 * wrappers around a single "root" node. 112 * <p> 113 * Nodes have outgoing {@link Edge}s but may optionally allow termination of matching operations 114 * when they are reached (this is the same as in DFAs). Unlike DFAs however, the out-edges of a 115 * node are grouped according to a mask of all digits which can reach the same target node. For 116 * node {@code A}, which can reach a target node {@code B} via digits {@code {1, 2, 3, 7, 9 }}, 117 * there is a single edge labeled with the bitmask {@code 0x287} (binary {@code 1010001110}) as 118 * opposed to 5 separate edges, each marked with a single digit. 119 * <p> 120 * This approach more closely matches the data representations in classes like {@link 121 * RangeSpecification} and affords additional efficiencies compared to the {@code JAutomata} 122 * library, which has performance issues when processing large trees. 123 */ 124 private static final class Node implements DfaNode { 125 /** The unique "terminal" node which must be the final node in all paths in all RangeTrees. */ 126 private static final Node TERMINAL = new Node(); 127 128 /** 129 * The list of edges, ordered by the lowest bit in each mask. The masks for each edge must be 130 * mutually disjoint. Only the terminal node is permitted to have zero outbound edges. 131 */ 132 private final ImmutableList<Edge> edges; 133 134 /** 135 * Derived bit-packed table of digit-number to edge-index. The edge index for digit {@code 'd'} 136 * is stored in 4-bits, starting at bit {@code 4 * d}. This is useful as it makes finding an 137 * outbound edge a constant time operation (instead of having to search through the list of 138 * edges). 139 */ 140 private final long jumpTable; 141 142 /** A cached value of the total number of unique digits sequences matched by this DFA. */ 143 private final long matchCount; 144 145 /** 146 * Mask of all possible lengths of digit sequences this node can match. If bit-N is set then 147 * this node can match at least one input sequence of length N. Note that this includes bit-0, 148 * which is matched if the node itself can terminate. It is even possible to have a range tree 149 * containing only the terminal node which matches only the empty digit sequence (this is 150 * distinct from the "empty" tree which matches no sequences at all). 151 */ 152 private final int lengthMask; 153 154 /** Nodes are used a keys during graph interning so we cache the hashcode. */ 155 private final int hashcode; 156 157 // Only for the terminal node. Node()158 private Node() { 159 this.edges = ImmutableList.of(); 160 this.jumpTable = -1L; 161 // Unlike the empty tree, the terminal node matches one input sequence, the empty sequence. 162 // The empty tree on the other hand doesn't even reference a terminal node, so there is no 163 // possible sequence it can match. 164 this.matchCount = 1L; 165 this.lengthMask = 1; 166 this.hashcode = -1; 167 } 168 169 // A node is defined entirely from its set of edges and whether it can terminate. Node(ImmutableList<Edge> edges, boolean canTerminate)170 private Node(ImmutableList<Edge> edges, boolean canTerminate) { 171 checkArgument(!edges.isEmpty()); 172 this.edges = edges; 173 // Everything below here is derived information from the edges and termination flag. 174 int lengthMask = 0; 175 // Set all bits in the jump table (so each 4-bit entry is '-1' unless otherwise overwritten). 176 long jumpTable = -1L; 177 int outMask = 0; 178 int lastLowBit = -1; 179 // If we can terminate we get an additional match count for the sequence that reaches us, but 180 // we may get more for longer sequences matched by nodes we link to. 181 long matchCount = canTerminate ? 1 : 0; 182 for (int n = 0; n < edges.size(); n++) { 183 Edge e = edges.get(n); 184 // Make sure edges are disjoint (edges masks are already known to be valid individually). 185 checkArgument((outMask & e.mask) == 0, "edge masks not disjoint: %s", e); 186 outMask |= e.mask; 187 // Make sure edges are ordered as expected. 188 int lowBit = numberOfTrailingZeros(e.mask); 189 checkArgument(lowBit > lastLowBit, "edge masks not ordered: %s", e); 190 lastLowBit = lowBit; 191 // Work out what the match count is based on the counts of everything we link to (the sum 192 // of all the counts of our target nodes weighted by how many times we link to them). 193 matchCount += Integer.bitCount(e.mask) * e.target.matchCount; 194 // Build up a mask of all the lengths that our target node can contain. 195 lengthMask |= e.target.lengthMask; 196 // For each bit in the edge mask, set the 4-bit nibble in the jump table to the edge index. 197 for (int d = 0; d <= 9; d++) { 198 if ((e.mask & (1 << d)) != 0) { 199 // We rely on the jump table entry having all its bits set here (true from above). 200 // n = 1010 (9). 201 // (n ^ 1111) << (4d) == 0000.0101.0000... 202 // xxxx.1111.yyyy... ^ 0000.0101.0000... == xxxx.1010.yyyy 203 jumpTable ^= (n ^ 0xFL) << (4 * d); 204 } 205 } 206 } 207 this.jumpTable = jumpTable; 208 this.matchCount = matchCount; 209 // Our length set is one more than all our targets (including bit zero if we can terminate). 210 this.lengthMask = (lengthMask << 1) | (canTerminate ? 1 : 0); 211 // Caching the hashcode makes interning faster (note that this is not recursive because the 212 // hashcode of an Edge relies on the identity hashcode of the target nodes). 213 this.hashcode = edges.hashCode() ^ Boolean.hashCode(canTerminate); 214 } 215 216 /** 217 * Returns the target node for the given input digit, or {@code null} if there is no out-edge 218 * for that digit. 219 */ findTarget(int digit)220 Node findTarget(int digit) { 221 checkArgument(0 <= digit && digit <= 9); 222 return targetFromJumpTableIndex((int) (jumpTable >>> (4 * digit)) & 0xF); 223 } 224 225 /** Helper to get the target node from an edge index (rather than a digit value). */ targetFromJumpTableIndex(int n)226 private Node targetFromJumpTableIndex(int n) { 227 return (n != 0xF) ? edges.get(n).target : null; 228 } 229 230 @Nullable 231 @Override find(int digit)232 public DfaEdge find(int digit) { 233 checkArgument(0 <= digit && digit <= 9); 234 int jumpTableIndex = (int) (jumpTable >>> (4 * digit)) & 0xF; 235 return (jumpTableIndex != 0xF) ? edges.get(jumpTableIndex) : null; 236 } 237 238 @Override canTerminate()239 public boolean canTerminate() { 240 return (lengthMask & 1) != 0; 241 } 242 243 @Override accept(DfaVisitor visitor)244 public void accept(DfaVisitor visitor) { 245 for (Edge e : edges) { 246 visitor.visit(this, e, e.target); 247 } 248 } 249 250 @Override getEdges()251 public List<DfaEdge> getEdges() { 252 // NOTE: This DOES NOT make a copy (or any allocations), since ImmutableList is clever 253 // enough to know that a list of <Bar extends Foo> is also a List<Foo> if they are 254 // unmodifiable. It's a clever cast essentially! 255 return ImmutableList.copyOf(edges); 256 } 257 258 @Override getLengthMask()259 public int getLengthMask() { 260 return lengthMask; 261 } 262 263 /** 264 * Returns whether this node is interchangeable with the given instance. Equality of 265 * {@code Node} instances is not "deep" equality and is carefully designed to make constructing 266 * minimal range trees easier. Two nodes are equal if they have the same set of edges and 267 * termination flag; however edges are equal only if they point to exactly the same target 268 * instances. This is carefully designed to make "interning" work efficiently and avoid 269 * unwanted recursion during various operations. 270 */ 271 @Override equals(Object o)272 public boolean equals(Object o) { 273 if (this == o) { 274 return true; 275 } 276 if (!(o instanceof Node)) { 277 return false; 278 } 279 Node tree = (Node) o; 280 return edges.equals(tree.edges) && canTerminate() == tree.canTerminate(); 281 } 282 283 @Override hashCode()284 public int hashCode() { 285 return hashcode; 286 } 287 } 288 289 /** 290 * A directed edge to a target {@link Node}. Note that edge equality is based on instance identity 291 * as part of the interning semantics of range trees. 292 */ 293 private static final class Edge implements DfaEdge { 294 /** Bit mask of digit values this edge accepts (bit-N set implies this edge accepts digit N). */ 295 private final int mask; 296 /** Target node (or node function) this edge points at. */ 297 private final Node target; 298 Edge(int mask, Node target)299 private Edge(int mask, Node target) { 300 checkArgument(mask > 0 && mask <= RangeSpecification.ALL_DIGITS_MASK); 301 this.mask = mask; 302 this.target = checkNotNull(target); 303 } 304 305 /** Returns a new edge with the same target whose mask is OR'ed with the given value. */ merge(int m)306 Edge merge(int m) { 307 return new Edge(mask | m, target); 308 } 309 310 @Override getDigitMask()311 public int getDigitMask() { 312 return mask; 313 } 314 315 @Override getTarget()316 public DfaNode getTarget() { 317 return target; 318 } 319 320 /** 321 * Edges are equal only if the point to exactly the same targets. This is important to avoid 322 * expensive recursive equality checking during common operations. 323 */ 324 @Override 325 @SuppressWarnings("ReferenceEquality") equals(Object o)326 public boolean equals(Object o) { 327 if (!(o instanceof Edge)) { 328 return false; 329 } 330 Edge other = (Edge) o; 331 return mask == other.mask && target == other.target; 332 } 333 334 @Override hashCode()335 public int hashCode() { 336 return mask ^ System.identityHashCode(target); 337 } 338 339 /** The natural string representation of an edge is the set of digits it accepts. */ 340 @Override toString()341 public String toString() { 342 return RangeSpecification.toString(mask); 343 } 344 } 345 346 /** 347 * Implementation of set-like operations for range trees. As well as having well defined set-like 348 * operations (union, intersection etc...), range trees are also minimal DFAs, ensuring that no 349 * duplication of sub-trees occurs. This class implements set-like operations efficiently using 350 * recursive interning of nodes and always produces minimal results by construction. This 351 * approach is similar to (but not the same as) dynamic programming. 352 * <p> 353 * Note that the terms "interning" and "minimizing" of range trees are related but not the same. 354 * Interning only makes sense in relation to some sequence of operations (and is a pure 355 * implementation detail for efficiency reasons). A minimal sub-tree exists outside of logical 356 * operations and may be the result of a logical operation, or something else. Minimization of 357 * the DFA represented by a range tree is a user concept. 358 * <p> 359 * Currently all logical operations produce minimal DFAs, but a minimal DFA can come from other 360 * sources (the DFA generated for a single range specification is minimal by construction). 361 */ 362 public static final class SetOperations { 363 // A weak reference wrapper around a Node allowing it to be garbage collected when not needed. 364 private static final class WeakNodeRef extends WeakReference<Node> { 365 // This MUST be cached since it cannot be calculated once the Node is garbage collected, and 366 // it's required that keys in maps do not change their hashcode. 367 private final int hashCode; 368 WeakNodeRef(Node referent, ReferenceQueue<? super Node> q)369 public WeakNodeRef(Node referent, ReferenceQueue<? super Node> q) { 370 super(checkNotNull(referent), q); 371 this.hashCode = referent.hashCode(); 372 } 373 374 @Override hashCode()375 public int hashCode() { 376 return hashCode; 377 } 378 379 /* 380 * This is very subtle. To avoid multiple "cleared" references becoming equal to each other 381 * (which violates the expectations of a map) we consider other nodes equal only if either: 382 * 1) they are the same instance 383 * 2) they have a non-null referent that's equal() to our referent 384 * 385 * Two distinct cleared references with the same hashcode must not compare as equal. 386 */ 387 @Override equals(Object obj)388 public boolean equals(Object obj) { 389 if (obj == this) { 390 return true; 391 } 392 // Don't worry about checking "instanceof" since this is a private type. 393 Node referent = get(); 394 return referent != null && referent.equals(((WeakNodeRef) obj).get()); 395 } 396 } 397 398 /** Minimal API for any logical operation which can be applied to two range trees. */ 399 interface LogicalOperation extends BiFunction<Node, Node, Node> { } 400 401 /** 402 * Implementation of the "union" operation for two sub-trees. The union of two sub-trees is a 403 * sub-tree which matches digit sequences if-and-only-if they are matched by either of the 404 * original sub-trees. 405 */ 406 private final class Union implements LogicalOperation { 407 @Override 408 @SuppressWarnings("ReferenceEquality") apply(Node lhs, Node rhs)409 public Node apply(Node lhs, Node rhs) { 410 // Assert that inputs are always interned. 411 // NOTE: It might be worth doing checks for "TERMINAL" here as well. 412 if (lhs == rhs || rhs == null) { 413 // (A ∪ A) = A and (A ∪ ∅) = A 414 return lhs; 415 } else if (lhs == null) { 416 // (∅ ∪ B) = B 417 return rhs; 418 } 419 return recurse(this, lhs, rhs, lhs.canTerminate() || rhs.canTerminate()); 420 } 421 } 422 423 /** 424 * Implementation of the "intersection" operation for two sub-trees. The intersection of two 425 * sub-trees is a sub-tree which matches digit sequences if-and-only-if they are matched by 426 * both the original sub-trees. 427 */ 428 private final class Intersection implements LogicalOperation { 429 @Override 430 @SuppressWarnings("ReferenceEquality") apply(Node lhs, Node rhs)431 public Node apply(Node lhs, Node rhs) { 432 // Assert that inputs are always interned. 433 // NOTE: It might be worth doing checks for "TERMINAL" here as well. 434 if (lhs == rhs) { 435 // (A ∩ A) = A 436 return lhs; 437 } else if (lhs == null || rhs == null) { 438 // (∅ ∩ X) = ∅ for any X 439 return null; 440 } 441 return recurse(this, lhs, rhs, lhs.canTerminate() && rhs.canTerminate()); 442 } 443 } 444 445 /** 446 * Implementation of the "subtraction" operation for two sub-trees. The subtraction of two 447 * sub-trees {@code A} and {@code B}, is a sub-tree which matches digit sequences if-and-only-if 448 * they are matched by {@code A} but not {@code B}. This is not a symmetrical operation. 449 */ 450 private final class Subtraction implements LogicalOperation { 451 @Override 452 @SuppressWarnings("ReferenceEquality") apply(Node lhs, Node rhs)453 public Node apply(Node lhs, Node rhs) { 454 // Assert that inputs are always interned. 455 // NOTE: It might be worth doing checks for "TERMINAL" here as well. 456 if (lhs == rhs || lhs == null) { 457 // (A ∖ A) = ∅ and (∅ ∖ B) = ∅ 458 return null; 459 } else if (rhs == null) { 460 // (A ∖ ∅) = A 461 return lhs; 462 } 463 return recurse(this, lhs, rhs, lhs.canTerminate() && !rhs.canTerminate()); 464 } 465 } 466 467 private final class Filter implements LogicalOperation { 468 // IMPORTANT: The prefix is neither returned, nor tested directly against instances in the 469 // range tree being filtered (other than the singleton TERMINAL node) which means it need 470 // not be interned before calling this function. If this method were ever changed to return 471 // nodes from the prefix tree or test instance equality with (interned) nodes in the range 472 // tree, then the prefix tree must also be interned before this method is called. 473 @Override 474 @SuppressWarnings("ReferenceEquality") apply(Node prefix, Node range)475 public Node apply(Node prefix, Node range) { 476 // Assert that ranges are always interned (prefixes don't need to be since we never return 477 // nodes in the prefix tree to form part of the filtered range). 478 if (prefix == null || range == null) { 479 return null; 480 } 481 // If we get to the end of the prefix, just return whatever's left in the range. 482 if (prefix == Node.TERMINAL) { 483 return range; 484 } 485 // Still "in" the prefix but we hit the end of the range (e.g. "123".filter("12") == ∅ 486 if (range == Node.TERMINAL) { 487 return null; 488 } 489 // Since we only recurse while still "in" the prefix we are never terminating (e.g. 490 // "123".filter({"12", "1234"}) == {"1234"} and does not contain "12"). 491 return recurse(this, prefix, range, false); 492 } 493 } 494 495 // Singleton set operations instance to handle interning of nodes. 496 static final SetOperations INSTANCE = new SetOperations(); 497 498 /** 499 * Weak-referenced interning map. This cannot be a standard Guava {@code Interner} because it 500 * must recursively intern the targets of any nodes to ensure that once a Node is interned, all 501 * the nodes reachable from it are also interned. 502 */ 503 private final Map<WeakNodeRef, WeakNodeRef> interningMap = new ConcurrentHashMap<>(); 504 505 /** 506 * Referent queue onto which node references clear by GC will be put. The elements in this 507 * queue should correspond to unused entries in the map which need to be tidied up. 508 */ 509 private final ReferenceQueue<Node> tidyUpQueue = new ReferenceQueue<>(); 510 511 private final LogicalOperation unionFn = new Union(); 512 private final LogicalOperation intersectionFn = new Intersection(); 513 private final LogicalOperation subtractionFn = new Subtraction(); 514 private final LogicalOperation retainFromFn = new Filter(); 515 SetOperations()516 private SetOperations() { 517 intern(Node.TERMINAL); 518 } 519 520 /** 521 * Interns the target of an edge (this does not make the edge itself interned, but it does 522 * allow edges to be efficiently compared via their targets). If the target of the given edge 523 * was already interned then it is just returned. 524 */ 525 @SuppressWarnings("ReferenceEquality") internTarget(Edge edge)526 private Edge internTarget(Edge edge) { 527 Node target = intern(edge.target); 528 return (target == edge.target) ? edge : new Edge(edge.mask, target); 529 } 530 531 /** 532 * Recursively interns a node and all nodes reachable from it. Note that if the given nodes do 533 * not represent a minimal DFA, then the interning process itself won't necessarily produce a 534 * minimal result. The minimal DFA property of range trees exists by induction and assumes that 535 * all trees are constructed minimally and that logical operations produce minimal trees. Note 536 * that if necessary, the interning operation could ensure minimization but at the cost of some 537 * efficiency (you would use the EDGE_COLLECTOR to squash duplicate edges). 538 */ intern(Node node)539 private Node intern(Node node) { 540 WeakNodeRef ref = new WeakNodeRef(node, tidyUpQueue); 541 WeakNodeRef existingRef = interningMap.get(ref); 542 if (existingRef != null) { 543 // Claim strong reference once into a local variable. 544 Node interned = existingRef.get(); 545 if (interned != null) { 546 // Clear "ref" to prevent it going in the tidy-up queue (it wasn't added to the map). 547 ref.clear(); 548 return interned; 549 } 550 } 551 // In the vast majority of cases, the edges of the node being interned already reference 552 // interned targets. The returned list contains edges to (recursively) interned nodes. 553 // If the edges we get back are the edges of our node, we just need to add ourselves to the 554 // intern map. If our edges were not interned (and we aren't in the map yet) then we must 555 // make a duplicate node that has only the interned edges before adding it to the map. This 556 // preserves the property that interned nodes only ever connect to other interned nodes. 557 ImmutableList<Edge> edges = 558 node.edges.stream().map(this::internTarget).collect(toImmutableList()); 559 if (!node.edges.equals(edges)) { 560 // Clear the original reference before overwriting the node to avoid it being put on the 561 // tidy-up queue (otherwise as soon as "node" is overwritten, GC could enqueue "ref"). 562 ref.clear(); 563 // Create a new node with interned edges and a corresponding weak reference. 564 node = new Node(edges, node.canTerminate()); 565 ref = new WeakNodeRef(node, tidyUpQueue); 566 } 567 // Consider the race condition where another thread added this node in the meantime. We 568 // cannot obtain a strong reference until after the WeakNode is returned from the map, which 569 // means there's always a race condition under which the referenced node will be collected. 570 while (true) { 571 existingRef = interningMap.putIfAbsent(ref, ref); 572 if (existingRef == null) { 573 // Easy case: We succeeded in putting our new reference into the map, so return our node. 574 return node; 575 } 576 // There's still a risk that the reference became null after being found in the map. 577 Node interned = existingRef.get(); 578 if (interned != null) { 579 // Clear "ref" to prevent it going in the tidy-up queue (it wasn't added to the map). 580 ref.clear(); 581 return interned; 582 } 583 // The reference must have been garbage collected after the weak node was found. This is 584 // very rare but possible, and the only real strategy is to try again. We can't really end 585 // up in a loop here unless we're continuously garbage collecting (i.e. bigger problems). 586 // We can't find the same reference again (it was cleared and is no longer equal-to "ref"). 587 } 588 } 589 590 // Remove all WeakNodeRefs that have been cleared by the garbage collector. This should 591 // precisely account for all weak nodes in the interning map which have been cleared by the 592 // garbage collector (weak nodes that were never added to the map are cleared manually and 593 // should not appear in the tidy-up queue). tidyUpInterningMap()594 private void tidyUpInterningMap() { 595 Reference<?> ref; 596 while ((ref = tidyUpQueue.poll()) != null) { 597 interningMap.remove(ref); 598 } 599 } 600 601 /** 602 * Applies the given operation recursively to a pair of interned nodes. The resulting node is 603 * interned and (if the input nodes were both minimal) minimal. 604 */ 605 @SuppressWarnings("ReferenceEquality") recurse(LogicalOperation logicalOp, Node lhs, Node rhs, boolean canTerminate)606 Node recurse(LogicalOperation logicalOp, Node lhs, Node rhs, boolean canTerminate) { 607 // Stage 1: Use the jump tables of target nodes to make a lookup of input edge index and mask. 608 // 609 // Each entry in the 'inputMap' array is a coded integer containing: 610 // [ lhs edge index | rhs edge index | bitmask of edges ] 611 // [ bits 20-24 | bits 16-20 | bits 0-10 ] 612 // 613 // Basically the top 16 bits are the indices for the inputs to the logical operation and the 614 // lower 16 bits are the mask of edge indices to which that result will apply. Because the 615 // map is constructed to avoid any duplication of the indices (the 'inputKey') we ensure that 616 // the logical operation is applied to the minimal number of unique inputs (no duplication). 617 long lhsJumpTable = lhs.jumpTable; 618 long rhsJumpTable = rhs.jumpTable; 619 // Note: Could reuse from a field (no longer thread safe, but that might be fine) 620 int[] inputMap = new int[10]; 621 int mapSize = 0; 622 // The digit mask runs from bit-0 (1) to bit-9. 623 for (int digitMask = 1; digitMask <= (1 << 9); digitMask <<= 1) { 624 int inputKey = (int) (((lhsJumpTable & 0xF) << 4) | (rhsJumpTable & 0xF)); 625 int n; 626 for (n = 0; n < mapSize; n++) { 627 if ((inputMap[n] >> 16) == inputKey) { 628 // Add this digit to an existing entry in the input map (the inputs are the same). 629 inputMap[n] |= digitMask; 630 break; 631 } 632 } 633 if (n == mapSize) { 634 // Add this digit to a new entry in the input map (and increase the map size). 635 inputMap[n] = (inputKey << 16) | digitMask; 636 mapSize++; 637 } 638 lhsJumpTable >>= 4; 639 rhsJumpTable >>= 4; 640 } 641 // Stage 2: Given a minimal set in inputs, perform the minimal number of logical operations. 642 // Note however that two operations can often return the same value (especially null or the 643 // TERMINAL node) so we have to minimize the results again and merge identical targets and 644 // masks. 645 List<Edge> out = new ArrayList<>(); 646 for (int n = 0; n < mapSize; n++) { 647 int mask = inputMap[n]; 648 // If lhs and rhs nodes are interned, then every target they reference is interned. 649 // We also assert that any nodes returned by logical operations are also interned. 650 Node node = logicalOp.apply( 651 lhs.targetFromJumpTableIndex((mask >> 20) & 0xF), 652 rhs.targetFromJumpTableIndex((mask >> 16) & 0xF)); 653 if (node == null) { 654 continue; 655 } 656 // Mask out the upper bits that are no longer needed. 657 mask &= RangeSpecification.ALL_DIGITS_MASK; 658 // Find if the result of the logical operation matches an existing result in the edge list. 659 int idx; 660 for (idx = 0; idx < out.size(); idx++) { 661 Edge e = out.get(idx); 662 if (e.target == node) { 663 // We matched an existing result, so replace the existing entry with a new edge which 664 // points to the same target but includes the new digits that also share this result. 665 out.set(idx, e.merge(mask)); 666 break; 667 } 668 } 669 if (idx == out.size()) { 670 // This is the first time this result was seen, so add it in a new entry. 671 out.add(new Edge(mask, node)); 672 } 673 } 674 // Stage 3: Given a minimal list of final edges (and after checking for degenerate cases 675 // (empty or terminating nodes) create and intern a new minimal node for the edges. 676 if (out.isEmpty()) { 677 return canTerminate ? Node.TERMINAL : null; 678 } else { 679 return intern(new Node(ImmutableList.copyOf(out), canTerminate)); 680 } 681 } 682 683 /** 684 * Returns the (minimal) logical union {@code '∪'} of two (minimal) sub-trees rooted at the 685 * given nodes. The sub-trees are interned before recursively applying the "union" function. 686 */ unionImpl(Node lhs, Node rhs)687 private Node unionImpl(Node lhs, Node rhs) { 688 if (lhs == null) { 689 return rhs; 690 } else if (rhs == null) { 691 return lhs; 692 } else { 693 return unionFn.apply(intern(lhs), intern(rhs)); 694 } 695 } 696 697 /** 698 * Returns the (minimal) logical intersection {@code '∩'} of two (minimal) sub-trees rooted at 699 * the given nodes. The sub-trees are interned before recursively applying the "union" function. 700 */ intersectImpl(Node lhs, Node rhs)701 private Node intersectImpl(Node lhs, Node rhs) { 702 if (lhs == null || rhs == null) { 703 return null; 704 } else { 705 return intersectionFn.apply(intern(lhs), intern(rhs)); 706 } 707 } 708 709 /** 710 * Returns the (minimal) logical subtraction {@code '∖'} of two (minimal) sub-trees rooted at 711 * the given nodes. The sub-trees are interned before recursively applying the "union" function. 712 */ subtractImpl(Node lhs, Node rhs)713 private Node subtractImpl(Node lhs, Node rhs) { 714 if (lhs == null) { 715 return null; 716 } else if (rhs == null) { 717 return lhs; 718 } else { 719 return subtractionFn.apply(intern(lhs), intern(rhs)); 720 } 721 } 722 retainFromImpl(Node prefix, Node range)723 private Node retainFromImpl(Node prefix, Node range) { 724 if (prefix == null || range == null) { 725 return null; 726 } 727 // As this operation never returns nodes that were in the prefix tree, or tests if prefix 728 // nodes are the same as instances in the range tree, there's no need to intern it. 729 return retainFromFn.apply(prefix, intern(range)); 730 } 731 732 /** Returns the union of one or more range trees. */ union(RangeTree first, RangeTree... rest)733 private RangeTree union(RangeTree first, RangeTree... rest) { 734 Node node = first.root; 735 for (RangeTree t : rest) { 736 node = unionImpl(node, t.root); 737 } 738 tidyUpInterningMap(); 739 return newOrEmptyTree(node); 740 } 741 742 /** 743 * Returns the union of two prefix trees. For prefix trees {@code p1} and {@code p2}, the union 744 * {@code P = p1.union(p2)} is defined such that: 745 * <pre>{@code 746 * P.filter(R) = p1.filter(R).union(p2.filter(R)) 747 * }</pre> 748 * If prefixes are the same length this is equivalent to {@link RangeTree#union(RangeTree)}, 749 * but when prefixes overlap, only the more general (shorter) prefix is retained. 750 */ union(PrefixTree lhs, PrefixTree rhs)751 PrefixTree union(PrefixTree lhs, PrefixTree rhs) { 752 // Using one prefix tree (A) to filter another (B), gives you the set of ranges in B which 753 // are, at least, also contained in A. The union of two prefix trees need only contain the 754 // more general (shorter) prefix and the more specific (longer) prefixes must be removed 755 // since they overlap with the more general ones. 756 // 757 // For example "12".retainFrom("1234") == "1234", but we don't want to retain "1234" in the 758 // final range tree (since it will already contain "12" anyway). 759 // 760 // If the same prefix exists in both inputs however, just doing this subtraction would remove 761 // it from the result (which is not what we want), so we also include the intersection of the 762 // prefix ranges. 763 RangeTree ltree = lhs.asRangeTree(); 764 RangeTree rtree = rhs.asRangeTree(); 765 return PrefixTree.from(union( 766 // Prefixes in both inputs (which would otherwise be removed by the subtractions below). 767 intersect(ltree, rtree), 768 // Prefixes in "lhs" which are strictly more general than any prefix in "rhs" 769 subtract(ltree, retainFrom(rhs, ltree)), 770 // Prefixes in "rhs" which are strictly more general than any prefix in "lhs" 771 subtract(rtree, retainFrom(lhs, rtree)))); 772 } 773 774 /** Returns the intersection of one or more range trees. */ intersect(RangeTree first, RangeTree... rest)775 private RangeTree intersect(RangeTree first, RangeTree... rest) { 776 Node node = first.root; 777 for (RangeTree t : rest) { 778 node = intersectImpl(node, t.root); 779 } 780 tidyUpInterningMap(); 781 return newOrEmptyTree(node); 782 } 783 784 /** 785 * Returns the intersection of two prefix trees. For prefix trees {@code p1} and {@code p2}, 786 * the intersection {@code P = p1.intersect(p2)} is defined such that: 787 * <pre>{@code 788 * P.filter(R) = p1.filter(R).intersect(p2.filter(R)) 789 * }</pre> 790 * If prefixes are the same length this is equivalent to {@link RangeTree#intersect(RangeTree)}, 791 * but when prefixes overlap, only the more specific (longer) prefix is retained. 792 */ intersect(PrefixTree lhs, PrefixTree rhs)793 PrefixTree intersect(PrefixTree lhs, PrefixTree rhs) { 794 return PrefixTree.from(union( 795 // Prefixes in "lhs" which are the same or more specific as any prefix in "rhs" 796 retainFrom(rhs, lhs.asRangeTree()), 797 // Prefixes in "rhs" which are the same or more specific as any prefix in "lhs" 798 retainFrom(lhs, rhs.asRangeTree()))); 799 } 800 801 /** Returns the difference of two range trees, {@code lhs - rhs}. */ subtract(RangeTree lhs, RangeTree rhs)802 private RangeTree subtract(RangeTree lhs, RangeTree rhs) { 803 Node node = subtractImpl(lhs.root, rhs.root); 804 tidyUpInterningMap(); 805 return newOrEmptyTree(node); 806 } 807 808 /** 809 * Returns a subset of the given ranges, containing only ranges which are prefixed by an 810 * element in the given prefix tree. For example: 811 * <pre> {@code 812 * RangeTree r = { "12xx", "1234x" } 813 * PrefixTree p = { "12[0-5]" } 814 * retainFrom(p, r) = { "12[0-5]x", "1234x"} 815 * }</pre> 816 * Note that if the prefix tree is empty, this method returns the empty range tree. 817 */ retainFrom(PrefixTree prefixes, RangeTree ranges)818 RangeTree retainFrom(PrefixTree prefixes, RangeTree ranges) { 819 Node node = retainFromImpl(prefixes.asRangeTree().root, ranges.root); 820 tidyUpInterningMap(); 821 return newOrEmptyTree(node); 822 } 823 } 824 825 /** 826 * Returns a minimal range tree for the given specification. The tree has only one path and only 827 * matches digit sequences of the same length as the given specification. 828 */ from(RangeSpecification s)829 public static RangeTree from(RangeSpecification s) { 830 Node node = Node.TERMINAL; 831 for (int n = s.length() - 1; n >= 0; n--) { 832 node = new Node(ImmutableList.of(new Edge(s.getBitmask(n), node)), false); 833 } 834 return newOrEmptyTree(node); 835 } 836 837 /** 838 * Returns a minimal range tree for the given specifications. This tree is formed as the logical 839 * union of the trees for all given specifications. 840 */ from(Iterable<RangeSpecification> specs)841 public static RangeTree from(Iterable<RangeSpecification> specs) { 842 SetOperations setOps = SetOperations.INSTANCE; 843 Node node = null; 844 for (RangeSpecification s : specs) { 845 node = setOps.unionImpl(node, from(s).root); 846 } 847 setOps.tidyUpInterningMap(); 848 return newOrEmptyTree(node); 849 } 850 851 /** 852 * Returns a minimal range tree for the given specifications. This tree is formed as the logical 853 * union of the trees for all given specifications. 854 */ from(Stream<RangeSpecification> specs)855 public static RangeTree from(Stream<RangeSpecification> specs) { 856 return from(specs.collect(toImmutableList())); 857 } 858 859 /** 860 * Returns a minimal range tree for the given digit sequence ranges. This tree is formed as the 861 * logical union of all the range specifications derived from the given ranges. 862 */ from(RangeSet<DigitSequence> ranges)863 public static RangeTree from(RangeSet<DigitSequence> ranges) { 864 // Currently we don't accept an empty range set in RangeSpecification.from(). 865 return !ranges.isEmpty() ? from(RangeSpecification.from(ranges)) : RangeTree.empty(); 866 } 867 868 /** 869 * Returns a range tree whose root is the given DfaNode. The given node must have been found by 870 * visiting an existing range tree. This method is useful for efficiently implementing "sub tree" 871 * logic in some cases. 872 * 873 * @throws IllegalArgumentException if the given node did not come from a valid range tree. 874 */ 875 @SuppressWarnings("ReferenceEquality") from(DfaNode root)876 public static RangeTree from(DfaNode root) { 877 checkNotNull(root, "root node cannot be null"); 878 checkArgument(root instanceof Node, 879 "invalid root node (wrong type='%s'): %s", root.getClass(), root); 880 Node node = (Node) root; 881 // Reference equality is correct since this is testing for interning. 882 checkArgument(node == SetOperations.INSTANCE.intern(node), 883 "invalid root node (not from valid RangeTree): %s", node); 884 return new RangeTree(node); 885 } 886 887 private static final RangeTree EMPTY = new RangeTree(); 888 889 /** Returns the enpty range tree, which matches only the empty digit sequence. */ empty()890 public static RangeTree empty() { 891 return EMPTY; 892 } 893 newOrEmptyTree(Node node)894 private static RangeTree newOrEmptyTree(Node node) { 895 return node != null ? new RangeTree(node) : EMPTY; 896 } 897 898 /** 899 * The root node, possibly null to signify the "empty" tree which matches no possible digit 900 * sequences (this is distinct from a tree that matches only the empty digit sequence). 901 */ 902 private final Node root; 903 private final long matchCount; 904 private final Supplier<ImmutableSortedSet<Integer>> lengths; 905 // Cached on demand. 906 private Integer hashCode = null; 907 908 /** Constructor for the singleton empty tree. */ RangeTree()909 private RangeTree() { 910 this.root = null; 911 // Unlike the terminal node (which matches the empty sequence), the empty tree matches nothing. 912 this.matchCount = 0L; 913 this.lengths = Suppliers.ofInstance(ImmutableSortedSet.of()); 914 } 915 916 /** Constructor for a non-empty tree. */ RangeTree(Node root)917 private RangeTree(Node root) { 918 this.root = Preconditions.checkNotNull(root); 919 this.matchCount = root.matchCount; 920 this.lengths = Suppliers.memoize(() -> calculateLengths(root)); 921 } 922 923 /** 924 * Returns whether this range tree accepts any input sequences. Note that in theory a range tree 925 * could accept the empty digit sequence (but in that case it would not be empty). An empty range 926 * tree cannot match any possible sequence. 927 */ isEmpty()928 public boolean isEmpty() { 929 return root == null; 930 } 931 calculateLengths(Node root)932 private static ImmutableSortedSet<Integer> calculateLengths(Node root) { 933 // Length mask cannot be 0 as it must match (at least) sequences of length 0. 934 int lengthMask = root.lengthMask; 935 ImmutableSortedSet.Builder<Integer> lengths = ImmutableSortedSet.naturalOrder(); 936 do { 937 int length = numberOfTrailingZeros(lengthMask); 938 lengths.add(length); 939 // Clear each bit as we go. 940 lengthMask &= ~(1 << length); 941 } while (lengthMask != 0); 942 return lengths.build(); 943 } 944 945 /** Returns the set of digit sequence lengths which could be matched by this range tree. */ getLengths()946 public ImmutableSortedSet<Integer> getLengths() { 947 return lengths.get(); 948 } 949 950 /** 951 * Returns the smallest digit sequence which will be accepted by this range tree, in 952 * {@link DigitSequence} order. Note that this is not the same as calling {@code sample(0)}, 953 * since {@link #sample(long)} does not use {@code DigitSequence} order. 954 * 955 * @return the smallest digit sequence accepted by this tree. 956 * @throws IllegalStateException if the tree is empty. 957 */ first()958 public DigitSequence first() { 959 checkState(!isEmpty(), "cannot get minimum sequence for an empty range tree"); 960 DigitSequence first = DigitSequence.empty(); 961 Node node = root; 962 int minLength = Integer.numberOfTrailingZeros(root.lengthMask); 963 if (minLength > 0) { 964 // Length mask is the mask for checking against the target node's length(s), so we pre-shift 965 // it by one (i.e. not "1 << minLength"). This is also why there needs to be a zero check 966 // around this loop since otherwise we would not correctly detect when the empty sequence was 967 // in the tree (we don't check the root node in this loop). 968 for (int lengthMask = 1 << (minLength - 1); lengthMask > 0; lengthMask >>>= 1) { 969 for (Edge e : node.edges) { 970 // Exit when we find the first edge for which the minimum length path can be reached. 971 // This is only possible for first() because edges are ordered by their minimum digit 972 // (you could not use a similar trick to implement a last() method). This break must be 973 // reached since at least once edge _must_ have the expected length bit set. 974 if ((e.target.lengthMask & lengthMask) != 0) { 975 first = first.extendBy(DigitSequence.singleton(Integer.numberOfTrailingZeros(e.mask))); 976 node = e.target; 977 break; 978 } 979 } 980 } 981 } 982 return first; 983 } 984 985 /** 986 * Returns a digit sequence in the range tree for a given sampling index (in the range 987 * {@code 0 <= index < size()}). Note that this method makes no promises about the specific 988 * ordering used since it is dependant on the internal tree structure. 989 * 990 * <p>However the mapping from index to sequence is guaranteed to be a bijection, so while it is 991 * not true that {@code sample(n).next().equals(sample(n+1))}, it is true that 992 * {@code sample(n).equals(sample(m))} if-and-only-if {@code n == m}. Thus a pseudo random sample 993 * of N distinct indices will result in N distinct sequences. 994 * 995 * <p>This method is not recommended for general iteration over a tree, since there can be 996 * trillions of digit sequences. 997 * 998 * @throws ArrayIndexOutOfBoundsException if the index is invalid. 999 */ sample(long index)1000 public DigitSequence sample(long index) { 1001 if (index < 0 || index >= size()) { 1002 throw new IndexOutOfBoundsException( 1003 String.format("index (%d) out of bounds [0...%d]", index, size())); 1004 } 1005 return recursiveGet(root, index); 1006 } 1007 1008 @SuppressWarnings("ReferenceEquality") // Nodes are interned. recursiveGet(Node node, long index)1009 private static DigitSequence recursiveGet(Node node, long index) { 1010 // We can assert that 0 <= index < node.matchCount by inspection (checked initially and true 1011 // by code inspection below). 1012 if (node.canTerminate()) { 1013 // Every recursion should end here (since at some point we traverse the final edge where the 1014 // index has been reduced to zero). However we also get here while still in the tree and must 1015 // decide whether to terminate if we see a terminating node. 1016 if (index == 0) { 1017 return DigitSequence.empty(); 1018 } 1019 // Subtract 1 to account for this early terminating digit sequence (it is reflected in the 1020 // match count so we must adjust our index before moving on). 1021 index -= 1; 1022 } 1023 // Should always have at least one out edge here so the mask isn't empty. 1024 checkState(node != Node.TERMINAL, "!!! Bad RangeTree !!!"); 1025 for (Edge e : node.edges) { 1026 long weightedCount = e.target.matchCount * Integer.bitCount(e.mask); 1027 if (index >= weightedCount) { 1028 // We are not following this edge, so adjust index and continue. 1029 index -= weightedCount; 1030 continue; 1031 } 1032 // Find which digit of the edge we are traversing. If we are in the Nth copy of the match 1033 // count we want the digit corresponding to the Nth bit in the edge mask. Achieve this by 1034 // repeatedly removing the lowest set bit each time around the loop (getting the lowest set 1035 // bit as a mask is way faster than getting it's bit position). 1036 int mask = e.mask; 1037 while (index >= e.target.matchCount) { 1038 index -= e.target.matchCount; 1039 mask &= ~Integer.lowestOneBit(mask); 1040 } 1041 return DigitSequence.singleton(Integer.numberOfTrailingZeros(mask)) 1042 .extendBy(recursiveGet(e.target, index)); 1043 } 1044 // Should be impossible since we should always find an edge for the current index. If we get 1045 // here something is very messed up with either this code or the internal data structure. 1046 throw new IllegalStateException("!!! Bad RangeTree !!!"); 1047 } 1048 1049 /** Returns the number of unique digit sequences contained in this range tree. */ size()1050 public long size() { 1051 return matchCount; 1052 } 1053 1054 // -------- Set-like operations -------- 1055 1056 /** Returns the minimal logical union of this instance and the given tree. */ union(RangeTree tree)1057 public RangeTree union(RangeTree tree) { 1058 return SetOperations.INSTANCE.union(this, tree); 1059 } 1060 1061 /** Returns the minimal logical intersection of this instance and the given tree. */ intersect(RangeTree tree)1062 public RangeTree intersect(RangeTree tree) { 1063 return SetOperations.INSTANCE.intersect(this, tree); 1064 } 1065 1066 /** Returns the minimal logical subtraction of the given tree from this instance. */ subtract(RangeTree tree)1067 public RangeTree subtract(RangeTree tree) { 1068 return SetOperations.INSTANCE.subtract(this, tree); 1069 } 1070 1071 /** Returns whether a given digit sequence is in the set of sequences defined by this tree. */ contains(DigitSequence digits)1072 public boolean contains(DigitSequence digits) { 1073 Node node = root; 1074 if (node == null) { 1075 return false; 1076 } 1077 for (int n = 0; n < digits.length(); n++) { 1078 node = node.findTarget(digits.getDigit(n)); 1079 if (node == null) { 1080 return false; 1081 } 1082 } 1083 return node.canTerminate(); 1084 } 1085 1086 /** 1087 * Returns true if the given tree is a subset of this instance. This is functionally equivalent 1088 * to {@code tree.subtract(this).isEmpty()}, but much more efficient in cases where it returns 1089 * false. 1090 */ containsAll(RangeTree tree)1091 public boolean containsAll(RangeTree tree) { 1092 if (tree.isEmpty()) { 1093 // Everything contains all the contents of the empty set (even the empty set). 1094 return true; 1095 } 1096 if (isEmpty()) { 1097 // Nothing is contained by the empty set. 1098 return false; 1099 } 1100 ContainsAllVisitor v = new ContainsAllVisitor(getInitial()); 1101 tree.getInitial().accept(v); 1102 return v.containsAll; 1103 } 1104 1105 // A very efficient test of tree containment (faster than doing "b.subtract(a).isEmpty()"). 1106 private static final class ContainsAllVisitor implements DfaVisitor { 1107 private boolean containsAll = true; 1108 private DfaNode current; ContainsAllVisitor(DfaNode node)1109 private ContainsAllVisitor(DfaNode node) { 1110 current = node; 1111 } 1112 1113 @SuppressWarnings("ReferenceEquality") 1114 @Override visit(DfaNode source, DfaEdge edge, DfaNode target)1115 public void visit(DfaNode source, DfaEdge edge, DfaNode target) { 1116 // Since nodes are interned once in a tree, '==' is sufficient and not potentially slow. 1117 if (current == source || !containsAll) { 1118 // An identical subtree means we can shortcut everything (also if we know we've failed). 1119 return; 1120 } 1121 // No containment if the "subset" tree has lengths not in the current tree. 1122 // This also effectively checks for termination at this node (via bit-0). 1123 if ((~current.getLengthMask() & source.getLengthMask()) != 0) { 1124 containsAll = false; 1125 return; 1126 } 1127 // Recursively check that the sub-tree of the target node is contained within one or more 1128 // edges of the current tree. 1129 int subMask = edge.getDigitMask(); 1130 for (DfaEdge e : current.getEdges()) { 1131 // Look at paths which are in both trees. 1132 int m = (e.getDigitMask() & subMask); 1133 if (m != 0) { 1134 DfaNode oldCurrent = current; 1135 current = e.getTarget(); 1136 edge.getTarget().accept(this); 1137 current = oldCurrent; 1138 if (!containsAll) { 1139 // Containment failure in some sub-tree. 1140 return; 1141 } 1142 // Clear bits we're accounted for. 1143 subMask &= ~m; 1144 } 1145 } 1146 // If not all edges were accounted for, this was not a valid sub-tree. 1147 if (subMask != 0) { 1148 containsAll = false; 1149 } 1150 } 1151 } 1152 1153 // -------- Non set-like operations (transforming a RangeTree in non set-like ways) -------- 1154 1155 /** A general mapping function for transforming a range tree via its specifications. */ map(Function<RangeSpecification, RangeSpecification> fn)1156 public RangeTree map(Function<RangeSpecification, RangeSpecification> fn) { 1157 return from(asRangeSpecifications().stream().map(fn)); 1158 } 1159 1160 /** 1161 * Returns a range tree which matches the same digit sequences as this instance down to the first 1162 * {@code n} digits. The returned tree is a super-set of this instance. 1163 */ significantDigits(int n)1164 public RangeTree significantDigits(int n) { 1165 checkArgument(n >= 0, "invalid significant digits"); 1166 return map(s -> s.first(n).extendByLength(Math.max(s.length() - n, 0))); 1167 } 1168 1169 /** Returns a range tree with the given "path" prefixed to the front. */ prefixWith(RangeSpecification prefix)1170 public RangeTree prefixWith(RangeSpecification prefix) { 1171 checkArgument(isEmpty() || getLengths().last() + prefix.length() <= DigitSequence.MAX_DIGITS, 1172 "cannot extend range tree (prefix '%s' too long): %s", prefix, this); 1173 return (prefix.length() > 0) ? map(prefix::extendBy) : this; 1174 } 1175 1176 /** 1177 * Slices a range tree at a single length. This is equivalent to {@code slice(length, length)}. 1178 */ slice(int length)1179 public RangeTree slice(int length) { 1180 return slice(length, length); 1181 } 1182 1183 /** 1184 * Slices a range tree within the specified length bounds. A path exists in the returned tree if 1185 * it's length is in the (inclusive) range {@code [minLength, maxLength]} or it was longer than 1186 * {@code maxLength} but has been truncated. Importantly the returned range tree is not a subset 1187 * of the original tree. 1188 * 1189 * <p>This method can be thought of as returning the "complete or partially complete digit 1190 * sequences up to the specified maximum length". It is useful for calculating prefixes which 1191 * match partial digit sequences (i.e. "as you type" formatting). 1192 * 1193 * <p>For example: 1194 * <pre> {@code 1195 * slice({ 12345, 67xxx, 89 }, 0, 3) == { 123, 67x, 89 } 1196 * slice({ 12345, 67xxx, 89 }, 3, 3) == { 123, 67x } 1197 * slice({ 12, 34, 5 }, 2, 3) == { 12, 34 } 1198 * slice({ 12, 34 }, 3, 3) == { } 1199 * }</pre> 1200 */ slice(int minLength, int maxLength)1201 public RangeTree slice(int minLength, int maxLength) { 1202 return from( 1203 asRangeSpecifications().stream() 1204 .filter(s -> s.length() >= minLength) 1205 .map(s -> s.first(maxLength))); 1206 } 1207 1208 // -------- Transformation APIs (converting a RangeTree to another representation) -------- 1209 1210 /** Returns the minimal, ordered list of range specifications represented by this tree. */ asRangeSpecifications()1211 public ImmutableList<RangeSpecification> asRangeSpecifications() { 1212 if (root == null) { 1213 return ImmutableList.of(); 1214 } 1215 List<RangeSpecification> out = new ArrayList<>(); 1216 int lenMask = root.lengthMask; 1217 1218 if ((lenMask & (lenMask - 1)) == 0) { 1219 // If this tree only matches one length of sequences, we can just serialize it directly. 1220 addSpecs(this.root, RangeSpecification.empty(), out); 1221 } else { 1222 // When a tree matches more than one length, we cannot just serialize it in one go, because 1223 // the tree for ["123", "####"] would serialize as: 1224 // ["123", "[02-9]###", "1[013-9]##", "12[0-24-9]#", "123#"] 1225 // and while the union of those 4-digit sequences is the same as "####", it's hardly minimal 1226 // or user friendly. In order to get a minimal serialization for a given length N, it is 1227 // sufficient to intersect the tree with the "allMatch" tree of length N (e.g. "####...") 1228 // and then serialize the result. 1229 SetOperations setOps = SetOperations.INSTANCE; 1230 for (Integer length : getLengths()) { 1231 // This can't be empty because we know there's at least one branch that matches digits 1232 // of the current length (or we would not have returned the length from getLengths()). 1233 addSpecs( 1234 setOps.intersectImpl(this.root, allMatch(length)), RangeSpecification.empty(), out); 1235 } 1236 setOps.tidyUpInterningMap(); 1237 } 1238 return ImmutableList.sortedCopyOf(Comparator.comparing(RangeSpecification::min), out); 1239 } 1240 1241 /** 1242 * Recursively adds the range specifications generated from the given sub-tree to the output 1243 * list. 1244 */ addSpecs(Node node, RangeSpecification spec, List<RangeSpecification> out)1245 private static void addSpecs(Node node, RangeSpecification spec, List<RangeSpecification> out) { 1246 if (node.canTerminate()) { 1247 out.add(spec); 1248 } 1249 for (Edge e : node.edges) { 1250 addSpecs(e.target, spec.extendByMask(e.mask), out); 1251 } 1252 } 1253 1254 /** Returns a node which accepts any digit sequences of the given length. */ allMatch(int length)1255 private static Node allMatch(int length) { 1256 Node node = Node.TERMINAL; 1257 for (int n = 0; n < length; n++) { 1258 node = new Node(ImmutableList.of(new Edge(ALL_DIGITS_MASK, node)), false); 1259 } 1260 return node; 1261 } 1262 1263 /** Returns the minimal covering of ranges specified by this tree. */ asRangeSet()1264 public ImmutableRangeSet<DigitSequence> asRangeSet() { 1265 ImmutableRangeSet.Builder<DigitSequence> out = ImmutableRangeSet.builder(); 1266 // Not all ranges create different range specifications are disjoint and this will merge then 1267 // into then minimal set. 1268 for (RangeSpecification s : asRangeSpecifications()) { 1269 out.addAll(s.asRanges()); 1270 } 1271 return out.build(); 1272 } 1273 1274 // -------- DFA visitor API -------- 1275 1276 /** 1277 * Accepts the given visitor on the root of this non-empty tree. 1278 * 1279 * @throws IllegalStateException if the tree is empty. 1280 */ accept(DfaVisitor visitor)1281 public void accept(DfaVisitor visitor) { 1282 checkState(root != null, "cannot accept a visitor on an empty range tree"); 1283 root.accept(visitor); 1284 } 1285 1286 /** 1287 * Returns the initial node of this non-empty tree. 1288 * 1289 * @throws IllegalStateException if the tree is empty. 1290 */ getInitial()1291 public DfaNode getInitial() { 1292 checkState(root != null, "cannot get the initial node from an empty range tree"); 1293 return root; 1294 } 1295 1296 /** Returns the singleton terminal node of any range tree. */ getTerminal()1297 public static DfaNode getTerminal() { 1298 return Node.TERMINAL; 1299 } 1300 1301 // -------- Miscellaneous Object APIs -------- 1302 1303 @SuppressWarnings("ReferenceEquality") 1304 @Override equals(Object o)1305 public boolean equals(Object o) { 1306 // This could also just convert both to range specifications and use that, but that's likely 1307 // a lot more work (not that this is trivial). If you really want fast equality of trees then 1308 // the interning map needs to be global (but also switched to a weak map). 1309 if (this == o) { 1310 return true; 1311 } 1312 if (!(o instanceof RangeTree)) { 1313 return false; 1314 } 1315 RangeTree other = (RangeTree) o; 1316 if (root == null && other.root == null) { 1317 // Empty trees are equal. 1318 return true; 1319 } 1320 if (root == null || other.root == null) { 1321 // An empty tree is never equal to a non-empty tree. 1322 return false; 1323 } 1324 // Intern both trees and see if their roots are now identical. 1325 SetOperations setOps = SetOperations.INSTANCE; 1326 return setOps.intern(root) == setOps.intern(((RangeTree) o).root); 1327 } 1328 1329 @Override hashCode()1330 public int hashCode() { 1331 if (hashCode == null) { 1332 hashCode = asRangeSpecifications().hashCode(); 1333 } 1334 return hashCode; 1335 } 1336 1337 /** Debugging only. */ 1338 @Override toString()1339 public String toString() { 1340 return asRangeSpecifications().toString(); 1341 } 1342 } 1343