1 /* 2 * Copyright (C) 2017 The Libphonenumber Authors. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.i18n.phonenumbers.metadata.regex; 18 19 import static com.google.common.base.Preconditions.checkArgument; 20 import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; 21 22 import com.google.common.base.Preconditions; 23 import com.google.common.collect.ImmutableList; 24 import com.google.common.collect.ImmutableSortedSet; 25 import com.google.i18n.phonenumbers.metadata.RangeSpecification; 26 import java.util.Collection; 27 import java.util.List; 28 import java.util.Set; 29 import java.util.stream.Collectors; 30 import java.util.stream.Stream; 31 32 /** 33 * Value type for edges in NFA graphs of phone number regular expressions. Outside this package, 34 * this type is mainly used for examining NFA graphs which represent a regular expression, 35 * generated via {@link RangeTreeConverter#toNfaGraph}.. 36 * 37 * <p>Note that the ordering of edges is carefully designed to attempt to replicate as much of the 38 * existing intuition about ordering in regular expressions as possible. This should result in any 39 * generated expressions being as close to existing hand edited expressions as possible. 40 */ 41 public abstract class Edge implements Comparable<Edge> { 42 /** API for visiting composite edges; see also {@link #accept(Visitor)}. */ 43 public interface Visitor { 44 /** Visits a leaf node simple edge. */ visit(SimpleEdge edge)45 void visit(SimpleEdge edge); 46 /** 47 * Visits a composited sequence of edges. Note that sequences only ever contain disjunctions or 48 * simple edges, but never other sequences. For edges "a", "b", "c", this represents the 49 * concatenated edge "abc". 50 */ visitSequence(List<Edge> edges)51 void visitSequence(List<Edge> edges); 52 /** 53 * Visits a disjunction of parallel edges. Note that disjunctions only ever contain sequences 54 * or simple edges, but never other disjunctions. For edges "a", "b", "c", this represents the 55 * disjunctive group "(a|b|c)". 56 */ visitGroup(Set<Edge> edges, boolean isOptional)57 void visitGroup(Set<Edge> edges, boolean isOptional); 58 } 59 60 // The singleton epsilon edge. 61 private static final SimpleEdge EPSILON = new SimpleEdge(); 62 // The singleton edge matching any digit (i.e. 'x' or '\d'). 63 private static final SimpleEdge ANY = new SimpleEdge(ALL_DIGITS_MASK, false); 64 // The singleton edge optionally matching any digit (i.e. 'x?' or '\d?'). 65 private static final SimpleEdge OPTIONAL_ANY = ANY.optional(); 66 67 /** Returns an edge which accepts digits 0 to 9 according tothe bits set in the given mask. */ fromMask(int digitMask)68 public static SimpleEdge fromMask(int digitMask) { 69 return digitMask == ALL_DIGITS_MASK ? ANY : new SimpleEdge(digitMask, false); 70 } 71 72 /** 73 * Returns the epsilon edge which accepts zero length input and transitions immediately. This 74 * edge should only ever appear parallel to other edges, and not as the only transition between 75 * two nodes. 76 */ epsilon()77 public static SimpleEdge epsilon() { 78 return EPSILON; 79 } 80 81 /** Returns the edge which accepts any digit {@code [0-9]}. */ any()82 public static SimpleEdge any() { 83 return ANY; 84 } 85 86 /** Returns the edge which optionally accepts any digit {@code [0-9]}. */ optionalAny()87 public static SimpleEdge optionalAny() { 88 return OPTIONAL_ANY; 89 } 90 91 /** 92 * Returns the ordered concatenation of the given edges. If either edge is a concatenation, it 93 * is first expanded, so that the resulting edge contains only simple edges or disjunctions. 94 */ concatenation(Edge lhs, Edge rhs)95 public static Edge concatenation(Edge lhs, Edge rhs) { 96 checkArgument(!lhs.equals(EPSILON) && !rhs.equals(EPSILON), "cannot concatenate epsilon edges"); 97 // Don't make concatenations of concatenations; flatten them out so you only have singletons 98 // or disjunctions. This is equivalent to writing "xyz" instead of "x(yz)". 99 List<Edge> edges = Stream.of(lhs, rhs) 100 .flatMap( 101 e -> (e instanceof Concatenation) ? ((Concatenation) e).edges.stream() : Stream.of(e)) 102 .collect(Collectors.toList()); 103 return new Concatenation(edges); 104 } 105 106 /** 107 * Returns the disjunction of the given edges. If either edge is already a concatenation, it 108 * is first expanded, so that the resulting edge contains only simple edges or disjunctions. 109 */ disjunction(Collection<Edge> edges)110 public static Edge disjunction(Collection<Edge> edges) { 111 // Don't make disjunctions of disjunctions; flatten them out so you only have singletons, 112 // concatenations or epsilon. This is equivalent to writing "(x|y|z)" instead of "(x|(y|z))". 113 List<Edge> allEdges = edges.stream() 114 .flatMap( 115 e -> (e instanceof Disjunction) ? ((Disjunction) e).edges.stream() : Stream.of(e)) 116 .sorted() 117 .distinct() 118 .collect(Collectors.toList()); 119 // There should only ever be one epsilon when we make a disjunction (disjunctions are made when 120 // subgraphs collapse and each subgraph should only have one epsilon to make it optional). 121 // Epsilons sort to-the-left of everything, so if there is an epsilon it must be the first edge. 122 boolean isOptional = allEdges.get(0) == EPSILON; 123 if (isOptional) { 124 allEdges = allEdges.subList(1, allEdges.size()); 125 } 126 Preconditions.checkState(!allEdges.contains(EPSILON)); 127 return new Disjunction(allEdges, isOptional); 128 } 129 130 /** An edge optionally matching a single input token, or the epsilon transition. */ 131 public static final class SimpleEdge extends Edge { 132 private final int digitMask; 133 private final boolean isOptional; 134 135 // Constructor for singleton epsilon edge. SimpleEdge()136 private SimpleEdge() { 137 this.digitMask = 0; 138 // An optional epsilon makes no real sense. 139 this.isOptional = false; 140 } 141 SimpleEdge(int digitMask, boolean isOptional)142 private SimpleEdge(int digitMask, boolean isOptional) { 143 checkArgument(digitMask > 0 && digitMask < (1 << 10), "invalid bit mask %s", digitMask); 144 this.digitMask = digitMask; 145 this.isOptional = isOptional; 146 } 147 148 /** Returns the mask of digits accepted by this edge. */ getDigitMask()149 public int getDigitMask() { 150 return digitMask; 151 } 152 153 /** Returns whether this edge is optional. */ isOptional()154 public boolean isOptional() { 155 return isOptional; 156 } 157 158 /** Returns an optional version of this, non-optional edge. */ optional()159 public SimpleEdge optional() { 160 Preconditions.checkState(digitMask != 0, "cannot make epsilon optional"); 161 Preconditions.checkState(!isOptional, "edge already optional"); 162 return new SimpleEdge(digitMask, true); 163 } 164 165 @Override accept(Visitor visitor)166 public void accept(Visitor visitor) { 167 visitor.visit(this); 168 } 169 170 @Override equals(Object obj)171 public boolean equals(Object obj) { 172 return (obj instanceof SimpleEdge) && digitMask == ((SimpleEdge) obj).digitMask; 173 } 174 175 @Override hashCode()176 public int hashCode() { 177 return digitMask; 178 } 179 180 @Override compareTo(Edge rhs)181 public int compareTo(Edge rhs) { 182 if (rhs instanceof SimpleEdge) { 183 return compare((SimpleEdge) rhs); 184 } else { 185 // Composite types know how to compare themselves to SimpleEdges, so delegate to them but 186 // remember to invert the result since we are reversing the comparison order. 187 return -rhs.compareTo(this); 188 } 189 } 190 compare(SimpleEdge rhs)191 private int compare(SimpleEdge rhs) { 192 if (isOptional != rhs.isOptional) { 193 // Optional edges sort to-the-right of non-optional things. 194 return isOptional ? 1 : -1; 195 } 196 if (digitMask == rhs.digitMask) { 197 return 0; 198 } 199 if (digitMask == 0 || rhs.digitMask == 0) { 200 // Epsilon sorts to-the-left of everything. 201 return digitMask == 0 ? -1 : 1; 202 } 203 // Unlike many other places where range specifications are used, we cannot guarantee the 204 // ranges are disjoint here, so we sort on the reversed bitmask to favour the lowest set bit. 205 // This sorts 'x' ([0-9]) to the left of everything, and epsilon to the right of everything. 206 // I.e. "x" < "0", "0" < "1", "[0-3]" < "[0-2]", "9" < epsilon. 207 // 208 // Remember to logical-shift back down to avoid negative values. 209 int reverseLhsMask = (Integer.reverse(digitMask) >>> 22); 210 int reverseRhsMask = (Integer.reverse(rhs.digitMask) >>> 22); 211 // Compare in the opposite order, so the largest reversed value is ordered "to the left". 212 return Integer.compare(reverseRhsMask, reverseLhsMask); 213 } 214 } 215 216 // A sequence of edges (disjunctions or simple edges). 217 private static final class Concatenation extends Edge { 218 private final ImmutableList<Edge> edges; 219 Concatenation(Collection<Edge> edges)220 private Concatenation(Collection<Edge> edges) { 221 this.edges = ImmutableList.copyOf(edges); 222 } 223 224 @Override accept(Visitor visitor)225 public void accept(Visitor visitor) { 226 visitor.visitSequence(edges); 227 } 228 229 @Override equals(Object obj)230 public boolean equals(Object obj) { 231 return (obj instanceof Concatenation) && edges.equals(((Concatenation) obj).edges); 232 } 233 234 @Override hashCode()235 public int hashCode() { 236 return edges.hashCode(); 237 } 238 239 @Override compareTo(Edge rhs)240 public int compareTo(Edge rhs) { 241 if (rhs instanceof Concatenation) { 242 return compareEdges(edges, ((Concatenation) rhs).edges); 243 } else { 244 // Compare our first edge to the non-concatenation. If this compares as equal, order the 245 // concatenation between simple edges and disjunctions to break the tie and avoid implying 246 // that a concatenation and a non-concatenation are equal. 247 int comparison = -rhs.compareTo(edges.get(0)); 248 return comparison != 0 ? comparison : (rhs instanceof SimpleEdge ? 1 : -1); 249 } 250 } 251 } 252 253 // A disjunctive group of edges (sequences or simple edges). 254 private static final class Disjunction extends Edge { 255 private final ImmutableSortedSet<Edge> edges; 256 private final boolean isOptional; 257 Disjunction(Collection<Edge> edges, boolean isOptional)258 private Disjunction(Collection<Edge> edges, boolean isOptional) { 259 checkArgument(!edges.isEmpty()); 260 this.edges = ImmutableSortedSet.copyOf(edges); 261 this.isOptional = isOptional; 262 } 263 264 @Override accept(Visitor visitor)265 public void accept(Visitor visitor) { 266 visitor.visitGroup(edges, isOptional); 267 } 268 269 @Override equals(Object obj)270 public boolean equals(Object obj) { 271 return (obj instanceof Disjunction) && edges.equals(((Disjunction) obj).edges); 272 } 273 274 @Override hashCode()275 public int hashCode() { 276 // Negate bits here to be different from Concatenation. 277 return ~edges.hashCode(); 278 } 279 280 @Override compareTo(Edge rhs)281 public int compareTo(Edge rhs) { 282 if (rhs instanceof Disjunction) { 283 return compareEdges(edges.asList(), ((Disjunction) rhs).edges.asList()); 284 } else { 285 // Compare our first edge to the non-disjunction. If this compares as equal, order the 286 // disjunction to the right of the other edge to break the tie and avoid implying that 287 // a disjunction and a non-disjunction are equal. 288 int comparison = -rhs.compareTo(edges.asList().get(0)); 289 return comparison == 0 ? 1 : comparison; 290 } 291 } 292 } 293 294 /** 295 * Accepts a visitor on this edge, visiting any sub-edges from which it is composed. This is a 296 * double-dispatch visitor to avoid anyone processing edges needing to know about specific types. 297 * Only the immediate edge is visited and the visitor is then responsible for visiting child 298 * edges. 299 */ accept(Visitor visitor)300 public abstract void accept(Visitor visitor); 301 302 // Compare lists according to elements, and tie break on length if different. This is effectively 303 // a lexicographical ordering. compareEdges(ImmutableList<Edge> lhs, ImmutableList<Edge> rhs)304 private static int compareEdges(ImmutableList<Edge> lhs, ImmutableList<Edge> rhs) { 305 int minSize = Math.min(lhs.size(), rhs.size()); 306 for (int n = 0; n < minSize; n++) { 307 int compared = lhs.get(n).compareTo(rhs.get(n)); 308 if (compared != 0) { 309 return compared; 310 } 311 } 312 return Integer.compare(lhs.size(), rhs.size()); 313 } 314 315 @Override toString()316 public String toString() { 317 StringBuilder out = new StringBuilder(); 318 accept(new Visitor() { 319 @Override 320 public void visit(SimpleEdge e) { 321 if (e.equals(Edge.epsilon())) { 322 // Epsilon cannot be optional. 323 out.append("e"); 324 } else { 325 int m = e.getDigitMask(); 326 out.append(m == ALL_DIGITS_MASK ? "x" : RangeSpecification.toString(m)); 327 if (e.isOptional()) { 328 out.append('?'); 329 } 330 } 331 } 332 333 @Override 334 public void visitSequence(List<Edge> edges) { 335 edges.forEach(e -> e.accept(this)); 336 } 337 338 @Override 339 public void visitGroup(Set<Edge> edges, boolean isOptional) { 340 out.append("("); 341 edges.forEach(e -> { 342 e.accept(this); 343 out.append("|"); 344 }); 345 out.setLength(out.length() - 1); 346 out.append(isOptional ? ")?" : ")"); 347 } 348 }); 349 return out.toString(); 350 } 351 } 352