1 /* 2 * Copyright (C) 2017 The Libphonenumber Authors. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 package com.google.i18n.phonenumbers.metadata; 17 18 import static com.google.common.base.Preconditions.checkArgument; 19 import static com.google.i18n.phonenumbers.metadata.DigitSequence.domain; 20 import static java.lang.Integer.numberOfLeadingZeros; 21 import static java.lang.Integer.numberOfTrailingZeros; 22 23 import com.google.common.collect.ContiguousSet; 24 import com.google.common.collect.ImmutableList; 25 import com.google.common.collect.Iterables; 26 import com.google.common.collect.Range; 27 import com.google.common.collect.RangeSet; 28 import java.util.ArrayList; 29 import java.util.Arrays; 30 import java.util.Comparator; 31 import java.util.Iterator; 32 import java.util.List; 33 import java.util.Set; 34 35 /** 36 * A compact representation of a disjoint set of ranges of digit sequences. This is a compact way 37 * to represent one or many ranges of digit sequences which share the same length. Examples include: 38 * <pre>{@code 39 * "01234" --> the singleton range containing only the digit sequence "01234" 40 * "012xx" --> the contiguous digit sequence range ["01200".."01299"] 41 * "012[3-5]6xx" --> the disjoint set of contiguous digit sequence ranges 42 * ["0123600".."0123699"], ["0124600".."0124699"], ["0125600".."0125699"] 43 * }</pre> 44 * Note that the sets of contiguous ranges defined by a {@code RangeSpecification} are always 45 * mutually disjoint. 46 * 47 * <p>Range specifications have a natural prefix based lexicographical ordering (based on the 48 * most-significant point at which a difference appears), but if you are comparing a disjoint set 49 * of range specifications (e.g. from a {@link RangeTree}) then it can be more intuitive to use an 50 * ordering based on the minimum digit sequence, but note this approach fails if the range 51 * specifications can overlap (e.g. comparing "1xx" and "100"). 52 */ 53 public final class RangeSpecification implements Comparable<RangeSpecification> { 54 /** The mask of all possible digits. */ 55 public static final char ALL_DIGITS_MASK = (1 << 10) - 1; 56 57 private static final RangeSpecification EMPTY = new RangeSpecification(""); 58 59 /** Returns the empty range specification, which matches only the empty digit sequence. */ empty()60 public static RangeSpecification empty() { 61 return EMPTY; 62 } 63 64 /** Returns the range specification of length one which matches any of the given digits. */ singleton(Iterable<Integer> digits)65 public static RangeSpecification singleton(Iterable<Integer> digits) { 66 int mask = 0; 67 for (int digit : digits) { 68 checkArgument(0 <= digit && digit <= 9, "bad digit value '%s'", digit); 69 mask |= (1 << digit); 70 } 71 return new RangeSpecification(String.valueOf((char) mask)); 72 } 73 74 /** Returns a new range specification which matches only the given non-empty digit sequence. */ from(DigitSequence s)75 public static RangeSpecification from(DigitSequence s) { 76 if (s.length() == 0) { 77 return RangeSpecification.empty(); 78 } 79 char[] masks = new char[s.length()]; 80 for (int n = 0; n < masks.length; n++) { 81 masks[n] = (char) (1 << s.getDigit(n)); 82 } 83 return new RangeSpecification(new String(masks)); 84 } 85 86 /** Returns a new range specification which matches any digit sequence of the specified length. */ any(int length)87 public static RangeSpecification any(int length) { 88 checkArgument(length >= 0); 89 if (length == 0) { 90 return RangeSpecification.empty(); 91 } 92 char[] masks = new char[length]; 93 Arrays.fill(masks, ALL_DIGITS_MASK); 94 return new RangeSpecification(new String(masks)); 95 } 96 97 /** 98 * Parses the string form of a range specification (e.g. "1234[57-9]xxx"). This must be 99 * correctly formed, including having all ranges be well formed (e.g. not "[33]", "[3-3]" or 100 * "[6-4]"). 101 * 102 * <p>Note that non-canonical ranges are permitted if the digits are in order (e.g. "[1234]", 103 * "[4-5]" or "[0-9]" but not "[4321]"). The returned range specification is canonical (e.g. 104 * {@code parse("12[34569]").toString() == "12[3-69]"}). 105 * 106 * <p>The empty string is parsed as the empty range specification. 107 * 108 * <p>The use of single ASCII underscores ("_") to group ranges and aid readability is supported 109 * during parsing but is not retained in the parsed result (e.g. 110 * {@code parse("12_34[5-8]_xxx_xxx").toString() == "1234[5-8]xxxxxx"}). Note that underscore may 111 * not be present inside ranges (e.g. "1_4") or at the ends of the range (e.g. "123xxx_"). 112 */ parse(String s)113 public static RangeSpecification parse(String s) { 114 if (s.isEmpty()) { 115 return empty(); 116 } 117 checkArgument(!s.startsWith("_") && !s.endsWith("_"), "cannot start/end with '_': %s", s); 118 StringBuilder bitmasks = new StringBuilder(); 119 boolean lastCharWasUnderscore = false; 120 for (int n = 0; n < s.length(); n++) { 121 char c = s.charAt(n); 122 switch (c) { 123 case '_': 124 checkArgument(!lastCharWasUnderscore, "cannot have multiple '_' in a row: %s", s); 125 lastCharWasUnderscore = true; 126 // Continue the for-loop rather than breaking out the switch to avoid resetting the flag. 127 continue; 128 case 'x': 129 bitmasks.append(ALL_DIGITS_MASK); 130 break; 131 case '[': 132 n += 1; 133 int end = s.indexOf(']', n); 134 checkArgument(end != -1, "unclosed range in specification: %s", s); 135 checkArgument(end > n, "empty range in specification: %s", s); 136 bitmasks.append(parseRange(s, n, end)); 137 n = end; 138 break; 139 default: 140 checkArgument('0' <= c && c <= '9', 141 "bad digit value '%s' in range specification: %s", c, s); 142 bitmasks.append((char) (1 << (c - '0'))); 143 break; 144 } 145 lastCharWasUnderscore = false; 146 } 147 return new RangeSpecification(bitmasks.toString()); 148 } 149 parseRange(String s, int start, int end)150 private static char parseRange(String s, int start, int end) { 151 int mask = 0; 152 for (int n = start; n < end;) { 153 char c = s.charAt(n++); 154 checkArgument('0' <= c && c <= '9', 155 "bad digit value '%s' in range specification: %s", c, s); 156 int shift = (c - '0'); 157 // check that this bit and all above it are zero (to ensure correct ordering). 158 checkArgument(mask >> shift == 0, "unordered range in specification: %s", s); 159 if (n == end || s.charAt(n) != '-') { 160 // Single digit not in a range. 161 mask |= 1 << shift; 162 continue; 163 } 164 n++; 165 checkArgument(n < end, "unclosed range in specification: %s", s); 166 c = s.charAt(n++); 167 checkArgument('0' <= c && c <= '9', 168 "bad digit value '%s' in range specification: %s", c, s); 169 int rshift = (c - '0'); 170 checkArgument(rshift > shift, "unordered range in specification: %s", s); 171 // Set bits from shift to rshift inclusive (e.g. 11111 & ~11 = 11100). 172 mask |= ((1 << (rshift + 1)) - 1) & ~((1 << shift) - 1); 173 } 174 return (char) mask; 175 } 176 177 /** 178 * Returns the canonical representation of the given ranges. The number of range specifications 179 * in the returned instance may be higher or lower than the number of given ranges. 180 * <p> 181 * NOTE: This is only used by RangeTree for generating a RangeTree from a RangeSet, and is not 182 * suitable as a public API (one day we might generate the RangeTree directly and be able to 183 * delete this code). 184 */ 185 static ImmutableList<RangeSpecification> from(RangeSet<DigitSequence> ranges) { 186 List<RangeSpecification> specs = new ArrayList<>(); 187 Set<Range<DigitSequence>> s = ranges.asRanges(); 188 checkArgument(!s.isEmpty(), "empty range set not permitted"); 189 // Make sure are ranges we use are canonicalized over the domain of DigitSequences (so Range 190 // operations (e.g. isConnected()) work as expected. See Range for more on why this matters. 191 Range<DigitSequence> cur = s.iterator().next().canonical(domain()); 192 checkArgument(!cur.contains(DigitSequence.empty()), 193 "empty digit sequence not permitted in range set"); 194 for (Range<DigitSequence> next : Iterables.skip(ranges.asRanges(), 1)) { 195 next = next.canonical(domain()); 196 if (cur.isConnected(next)) { 197 // Even though 'cur' and 'next' are both canonicalized, it's not guaranteed that they are 198 // closed-open (singleton ranges are fully closed and any range containing the maximum 199 // value must be closed. To "union" the two ranges we must also preserve the bound types. 200 cur = Range.range( 201 cur.lowerEndpoint(), cur.lowerBoundType(), 202 next.upperEndpoint(), next.upperBoundType()) 203 .canonical(domain()); 204 continue; 205 } 206 addRangeSpecsOf(cur, specs); 207 cur = next; 208 } 209 addRangeSpecsOf(cur, specs); 210 return ImmutableList.sortedCopyOf(Comparator.comparing(RangeSpecification::min), specs); 211 } 212 213 /** Adds the canonical minimal range specifications for a single range to the given list. */ 214 private static void addRangeSpecsOf(Range<DigitSequence> r, List<RangeSpecification> specs) { 215 // Given range is already canonical but may span multiple lengths. It's easier to view this 216 // as a contiguous set when finding first/last elements however to avoid worrying about bound 217 // types. A contiguous set is not an expensive class to create. 218 ContiguousSet<DigitSequence> s = ContiguousSet.create(r, domain()); 219 DigitSequence start = s.first(); 220 DigitSequence end = s.last(); 221 while (start.length() < end.length()) { 222 // Add <start> to "999..." for the current block length (the max domain value is all 9's). 223 DigitSequence blockEnd = DigitSequence.nines(start.length()); 224 addRangeSpecs(start, blockEnd, specs); 225 // Reset the start to the next length up (i.e. the "000..." sequence that's one longer). 226 start = blockEnd.next(); 227 } 228 // Finally and the range specs up to (and including) the end value. 229 addRangeSpecs(start, end, specs); 230 } 231 232 // Adds canonical minimal range specifications for the range of same-length digit sequences. 233 private static void addRangeSpecs( 234 DigitSequence start, DigitSequence end, List<RangeSpecification> specs) { 235 int length = start.length(); 236 checkArgument(end.length() == length); 237 238 // Masks contains a running total of the bitmasks we want to convert to RangeSpecifications. 239 // As processing proceeds, the mask array is reused. This is because the prefix used for 240 // successive range specifications is always a subset of the previous specifications and the 241 // trailing part of the array always fills up with the range mask for 'x' (i.e. [0-9]). 242 int[] masks = new int[length]; 243 244 // Stage 1: 245 // Starting from the last digit in the 'start' sequence, work up until we find something that 246 // is not a '0'. This is the first digit that needs to be adjusted to create a range 247 // specification covering it and the digits 'below' it. For example, the first specification 248 // for the range ["1200".."9999"] is "1[2-9]xx". 249 // Once a specification is emitted, the start value is adjusted to the next digit sequence 250 // immediately above the end of the emitted range, so after emitting "1[2-9]xx", start="2000". 251 // Once each range specification is emitted, we continue working 'up' the digit sequence until 252 // the next calculated start value exceeds the 'end' of our range. This specification cannot 253 // be emitted and signals the end of stage 1. 254 setBitmasks(masks, start); 255 for (int n = previousNon(0, start, length); n != -1; n = previousNon(0, start, n)) { 256 int loDigit = start.getDigit(n); 257 DigitSequence prefix = start.first(n); 258 DigitSequence blockEnd = prefix.extendBy(DigitSequence.nines(length - n)); 259 if (blockEnd.compareTo(end) > 0) { 260 // The end of this block would exceed the end of the main range, so we must stop. 261 break; 262 } 263 // The bitmasks we want is: 264 // <first (n-1) digits of 'start'> [loDigit..9] <any digits mask...> 265 masks[n] = bitmaskUpFrom(loDigit); 266 fillBitmasksAfter(masks, n); 267 specs.add(RangeSpecification.fromBitmasks(masks)); 268 // Adjust the range start now we have emitted the range specification. 269 start = blockEnd.next(); 270 } 271 272 // Stage 2: 273 // Very similar to stage 1, but work up from the last digit in the 'end' sequence. The 274 // difference now is that we look for the first digit that's not '9' and generate ranges that 275 // go down to the start of the range, not up to the end. Thus for ["0000", "1299"] the first 276 // specification generated is "1[0-2]xx", which is emitted at the end of the list. 277 int midIdx = specs.size(); 278 setBitmasks(masks, end); 279 for (int n = previousNon(9, end, length); n != -1; n = previousNon(9, end, n)) { 280 int hiDigit = end.getDigit(n); 281 DigitSequence prefix = end.first(n); 282 DigitSequence blockStart = prefix.extendBy(DigitSequence.zeros(length - n)); 283 if (blockStart.compareTo(start) < 0) { 284 // The start of this block would precede the start of the main range, so we must stop. 285 break; 286 } 287 // The bitmasks we want is: 288 // <first (n-1) digits of 'end'> [0..hiDigit] <any digits mask...> 289 masks[n] = bitmaskDownFrom(hiDigit); 290 fillBitmasksAfter(masks, n); 291 specs.add(midIdx, RangeSpecification.fromBitmasks(masks)); 292 // Adjust the range end now we have emitted the range specification. 293 end = blockStart.previous(); 294 } 295 296 // Stage 3: Having emitted the first and last set of range specifications, it only remains to 297 // emit the "center" specification in the middle of the list. This is special as neither bound 298 // is the end of a block. In previous stages, all partial ranges are either "up to 9" or 299 // "down to zero". For example: ["1234".."1789"] has the center range "1[3-6]xx", and 300 // ["1234".."1345"] has no center range at all. 301 if (start.compareTo(end) < 0) { 302 // Find the last digit before start and end combine (ie, 1200, 1299 --> 12xx --> n=1). We 303 // know that 'start' and 'end' are the same length and bound a range like: 304 // <prefix> [X..Y] [000..999] 305 // but X or Y could be 0 or 9 respectively (just not both). 306 // 307 // Note that we don't even both to test the first digit in the sequences because if 'start' 308 // and 'end' span a full range (e.g. [000.999]) we can just use the same code to fill the 309 // masks correctly anyway. 310 int n = start.length(); 311 while (--n > 0 && start.getDigit(n) == 0 && end.getDigit(n) == 9) {} 312 // Bitwise AND the masks for [X..9] and [0..Y] to get the mask for [X..Y]. 313 // Note that the "masks" array already contains the correct prefix digits up to (n-1). 314 masks[n] = bitmaskUpFrom(start.getDigit(n)) & bitmaskDownFrom(end.getDigit(n)); 315 fillBitmasksAfter(masks, n); 316 specs.add(midIdx, RangeSpecification.fromBitmasks(masks)); 317 } 318 } 319 320 // Sets the values in the given array to correspond to the digits in the given sequence. If a 321 // range specification were made from the resulting array it would match only that digit sequence. 322 private static void setBitmasks(int[] masks, DigitSequence s) { 323 for (int n = 0; n < s.length(); n++) { 324 masks[n] = 1 << s.getDigit(n); 325 } 326 } 327 328 /** 329 * Creates a range specification from a given array of integer masks. The Nth element of the 330 * array corresponds to the Nth element in the range specification, and mask values must be 331 * non-zero and have only bits 0 to 9 set. 332 */ 333 private static RangeSpecification fromBitmasks(int[] bitmasks) { 334 checkArgument(bitmasks.length <= DigitSequence.MAX_DIGITS, 335 "range specification too large"); 336 StringBuilder s = new StringBuilder(bitmasks.length); 337 s.setLength(bitmasks.length); 338 for (int n = 0; n < bitmasks.length; n++) { 339 int mask = bitmasks[n]; 340 checkArgument(mask > 0 && mask <= ALL_DIGITS_MASK, "invalid bitmask: %s", mask); 341 s.setCharAt(n, (char) mask); 342 } 343 return new RangeSpecification(s.toString()); 344 } 345 346 // Fills the bitmasks after the given index with the "all digits" mask (i.e. matching [0-9]). 347 // This can accept -1 as the index since it always pre-increments before using it. 348 private static void fillBitmasksAfter(int[] masks, int n) { 349 // Because of the iterative way the mask array is handled, we can stop filling when we hit 350 // ALL_DIGITS_MASK because everything past that must already be filled. 351 while (++n < masks.length && masks[n] != ALL_DIGITS_MASK) { 352 masks[n] = ALL_DIGITS_MASK; 353 } 354 } 355 356 // Starting at digit-N, returns the index of the nearest preceding digit that's not equal to the 357 // given value (or -1 if no such digit exists). 358 private static int previousNon(int digit, DigitSequence s, int n) { 359 while (--n >= 0 && s.getDigit(n) == digit) {} 360 return n; 361 } 362 363 /** Returns the bitmask for the range {@code [n-9]}. */ 364 private static int bitmaskUpFrom(int n) { 365 return (-1 << n) & ALL_DIGITS_MASK; 366 } 367 368 /** Returns the bitmask for the range {@code [0-n]}. */ 369 private static int bitmaskDownFrom(int n) { 370 return ALL_DIGITS_MASK >>> (9 - n); 371 } 372 373 374 // String containing one bitmasks per character (bits 0..9). 375 private final String bitmasks; 376 // Minimum and maximum sequences (inclusive) which span the ranges defined by this specification. 377 // Caching this is deliberate, since we sort disjoint ranges using the minimum value. It might 378 // not be so useful to cache the maximum value though. 379 private final DigitSequence min; 380 private final DigitSequence max; 381 // Total number of sequences matched by this specification. 382 private final long sequenceCount; 383 384 private RangeSpecification(String bitmasks) { 385 int length = bitmasks.length(); 386 checkArgument(length <= DigitSequence.MAX_DIGITS, 387 "Range specification too long (%s digits)", length); 388 this.bitmasks = bitmasks; 389 long minValue = 0; 390 long maxValue = 0; 391 long sequenceCount = 1; 392 for (int n = 0; n < length; n++) { 393 int mask = bitmasks.charAt(n); 394 checkArgument(mask > 0 && mask <= ALL_DIGITS_MASK, "invalid bitmask: %s", mask); 395 minValue = (minValue * 10) + numberOfTrailingZeros(mask); 396 maxValue = (maxValue * 10) + (31 - numberOfLeadingZeros(mask)); 397 sequenceCount *= Integer.bitCount(mask); 398 } 399 this.min = new DigitSequence(length, minValue); 400 this.max = new DigitSequence(length, maxValue); 401 this.sequenceCount = sequenceCount; 402 } 403 404 /** 405 * Returns the number of digits that this specification can match. This is the length of all 406 * digit sequences which can match this specification. 407 */ 408 public int length() { 409 return bitmasks.length(); 410 } 411 412 /** Returns the smallest digit sequence matched by this range. */ 413 public DigitSequence min() { 414 return min; 415 } 416 417 /** Returns the largest digit sequence matched by this range. */ 418 public DigitSequence max() { 419 return max; 420 } 421 422 /** Returns the total number of digit sequences matched by (contained in) this specification. */ 423 public long getSequenceCount() { 424 return sequenceCount; 425 } 426 427 /** 428 * Returns the bitmask of the Nth range in this specification. Bit-X (0 ≤ X ≤ 9) corresponds to 429 * the digit with value X. As every range in a specification must match at least one digit, this 430 * mask can never be zero. 431 */ 432 public int getBitmask(int n) { 433 return bitmasks.charAt(n); 434 } 435 436 /** 437 * Returns whether the given digit sequence is in one of the ranges specified by this instance. 438 * This is more efficient that obtaining the associated {@code RangeSet} and checking that. 439 */ 440 public boolean matches(DigitSequence digits) { 441 if (digits.length() != length()) { 442 return false; 443 } 444 for (int n = 0; n < length(); n++) { 445 if ((bitmasks.charAt(n) & (1 << digits.getDigit(n))) == 0) { 446 return false; 447 } 448 } 449 return true; 450 } 451 452 // Returns the next sequence in forward order which is contained by a range defined by this 453 // range specification, or null if none exists. The given sequence must not be matched by this 454 // specification. 455 private DigitSequence nextRangeStart(DigitSequence s) { 456 // Easy length based checks (this is where the fact that range specification only define ranges 457 // of the same length really simplifies things). 458 if (s.length() < length()) { 459 return min(); 460 } else if (s.length() > length()) { 461 return null; 462 } 463 // Algorithm: 464 // 1) Find the highest digit that isn't in the corresponding bitmask for the range. 465 // 2) Try and increase the digit value until it's inside the next available range. 466 // 3) If that fails, move back up the sequence and increment the next digit up. 467 // 4) Repeat until a digit can be adjusted to start a new range, or all digits are exhausted. 468 // If all digits exhausted, the sequence was above all ranges in this specification. 469 // Otherwise return a new sequence using the unchanged prefix of the original sequence, the 470 // newly adjusted digit and the trailing digits of the minimal sequence. 471 for (int n = 0; n < length(); n++) { 472 int d = s.getDigit(n); 473 int mask = bitmasks.charAt(n); 474 if ((mask & (1 << d)) != 0) { 475 continue; 476 } 477 while (true) { 478 // Digit 'd' is either outside the range mask (first time though the loop) or inside a 479 // range. Either way we want to find the next digit above it which is inside a range. 480 // First increment 'd', and then find the next set bit in the mask at or above that point. 481 // Not extra check is needed at the end of ranges because numberOfTrailingZeros(0)==32 482 // which neatly ensures that the new value of 'd' must be out-of-range. 483 // If mask=[3-58]: d=1-->d'=3, d=4-->d'=5, d=5-->d'=8, d=8-->d'>9 484 d++; 485 d += numberOfTrailingZeros(mask >>> d); 486 if (d <= 9) { 487 // Found the value of the largest digit which can be adjusted to start the next range. 488 // Everything higher than this digit is the same as the original sequence and everything 489 // lower that this digit is the same as the corresponding digit in the minimal value. 490 return s.first(n).extendBy(d).extendBy(min.last((length() - n) - 1)); 491 } 492 // No more bits available in this range, so go back up to the previous range. 493 if (--n < 0) { 494 // The sequence was above the last element in the set. 495 // Example: Range Spec: 1[2-8][3-8]456, Sequence: 188457 496 return null; 497 } 498 d = s.getDigit(n); 499 mask = bitmasks.charAt(n); 500 } 501 } 502 // If we finish the outer loop the given sequence was in a range (which is an error). 503 throw new IllegalArgumentException( 504 "Digit sequence '" + s + "' is in the range specified by: " + this); 505 } 506 507 // Given a sequence inside a range defined by this specification, return the highest sequence 508 // in the current range (possibly just the given sequence). 509 private DigitSequence currentRangeEnd(DigitSequence s) { 510 // Build up a value representing the trailing digits (which must always be 9's). 511 long nines = 0; 512 for (int n = length() - 1; n >= 0; n--, nines = (10 * nines) + 9) { 513 int mask = bitmasks.charAt(n); 514 if (mask == ALL_DIGITS_MASK) { 515 continue; 516 } 517 // The new digit is the top of the current range that the current sequence digit is in. 518 int d = nextUnsetBit(mask, s.getDigit(n)) - 1; 519 DigitSequence end = 520 s.first(n).extendBy(d).extendBy(new DigitSequence((length() - n) - 1, nines)); 521 // Edge case for cases like "12[34][09]x" where "1239x" and "1240x" abut. This adjustment 522 // will happen at most once because the second range cannot also include an upper bound 523 // ending at '9', since otherwise (mask == ALL_DIGITS_MASK) at this position. The next 524 // sequence must be terminated with zeros starting at the current position having "rolled 525 // over" on the digit above. 526 if (d == 9) { 527 DigitSequence next = end.next(); 528 if (matches(next)) { 529 d = nextUnsetBit(mask, 0) - 1; 530 end = next.first(n).extendBy(d).extendBy(new DigitSequence((length() - n) - 1, nines)); 531 } 532 } 533 return end; 534 } 535 // The range specification is entirely 'x', which means it's a single range. 536 return max; 537 } 538 539 /** 540 * Returns a generating iterator which iterates in forward order over the disjoint ranges defined 541 * by this specification. This is not actually as useful as you might expect because in a lot of 542 * cases you would be dealing with a sequence of range specifications and it's not true that all 543 * ranges from multiple specifications are disjoint. 544 */ 545 Iterable<Range<DigitSequence>> asRanges() { 546 return () -> new Iterator<Range<DigitSequence>>() { 547 // Start is always in a range. 548 private DigitSequence start = min; 549 550 @Override 551 public boolean hasNext() { 552 return start != null; 553 } 554 555 @Override 556 public Range<DigitSequence> next() { 557 DigitSequence end = currentRangeEnd(start); 558 Range<DigitSequence> r = Range.closed(start, end).canonical(DigitSequence.domain()); 559 start = nextRangeStart(end.next()); 560 return r; 561 } 562 }; 563 } 564 565 /** 566 * Returns a new range specification which is extended by the given mask value. For example: 567 * <pre>{@code 568 * "0123[4-6]".extendByMask(7) == "0123[4-6][0-2]" 569 * }</pre> 570 */ 571 public RangeSpecification extendByMask(int mask) { 572 checkArgument(mask > 0 && mask <= ALL_DIGITS_MASK, "bad mask value '%s'", mask); 573 return new RangeSpecification(bitmasks + ((char) mask)); 574 } 575 576 /** 577 * Returns a new range specification which is extended by the given specification. For example: 578 * <pre>{@code 579 * "0123[4-6]".extendBy("7[89]") == "0123[4-6]7[89]" 580 * }</pre> 581 */ 582 public RangeSpecification extendBy(RangeSpecification extra) { 583 return new RangeSpecification(bitmasks + extra.bitmasks); 584 } 585 586 /** 587 * Returns a new range specification which is extended by a sequence of any digits of the given 588 * length. For example: 589 * <pre>{@code 590 * "012".extendByLength(4) == "012xxxx" 591 * }</pre> 592 */ 593 public RangeSpecification extendByLength(int length) { 594 return this.extendBy(any(length)); 595 } 596 597 /** 598 * Returns a range specification containing only the first {@code n} digits. If the given length 599 * is the same or greater than the specification's length, this specification is returned. 600 * For example: 601 * <pre>{@code 602 * "01[2-4]xx".first(8) == "01[2-4]xx" (same instance) 603 * "01[2-4]xx".first(5) == "01[2-4]xx" (same instance) 604 * "01[2-4]xx".first(3) == "01[2-4]" 605 * "01[2-4]xx".first(0) == "" (the empty specification) 606 * }</pre> 607 */ 608 public RangeSpecification first(int n) { 609 checkArgument(n >= 0); 610 if (n == 0) { 611 return empty(); 612 } 613 return n < length() ? new RangeSpecification(bitmasks.substring(0, n)) : this; 614 } 615 616 /** 617 * Returns a range specification containing only the last {@code n} digits. If the given length 618 * is the same or greater than the specification's length, this specification is returned. 619 * For example: 620 * <pre>{@code 621 * "01[2-4]xx".last(8) == "01[2-4]xx" (same instance) 622 * "01[2-4]xx".last(5) == "01[2-4]xx" (same instance) 623 * "01[2-4]xx".last(3) == "[2-4]xx" 624 * "01[2-4]xx".last(0) == "" (the empty specification) 625 * }</pre> 626 */ 627 public RangeSpecification last(int n) { 628 checkArgument(n >= 0); 629 if (n == 0) { 630 return empty(); 631 } 632 return n < length() ? new RangeSpecification(bitmasks.substring(length() - n)) : this; 633 } 634 635 /** 636 * Returns a range specification with any trailing "any digit" sequence removed. For example: 637 * <pre>{@code 638 * "0123".getPrefix() == "0123" (same instance) 639 * "0123xx".getPrefix() == "0123" 640 * "xxx".getPrefix() == "" (the empty specification) 641 * }</pre> 642 */ 643 public RangeSpecification getPrefix() { 644 int length = length(); 645 while (length > 0 && getBitmask(length - 1) == ALL_DIGITS_MASK) { 646 length--; 647 } 648 return first(length); 649 } 650 651 @Override 652 public int compareTo(RangeSpecification other) { 653 int length = Math.min(length(), other.length()); 654 for (int i = 0; i < length; i++) { 655 int mask = getBitmask(i); 656 int otherMask = other.getBitmask(i); 657 if (mask == otherMask) { 658 continue; 659 } 660 int commonBits = mask & otherMask; 661 mask -= commonBits; 662 otherMask -= commonBits; 663 // At least one mask is still non-zero and they don't overlap. 664 // 665 // The mask with the lowest set bit is the smaller mask in the ordering, since that bit 666 // distinguishes a smaller prefix than can never exist in the other specification. 667 // Testing the number of trailing zeros is equivalent to finding the lowest set bit. 668 return Integer.compare(numberOfTrailingZeros(mask), numberOfTrailingZeros(otherMask)); 669 } 670 return Integer.compare(length(), other.length()); 671 } 672 673 @Override 674 public boolean equals(Object o) { 675 return (o instanceof RangeSpecification) && bitmasks.equals(((RangeSpecification) o).bitmasks); 676 } 677 678 @Override 679 public int hashCode() { 680 return bitmasks.hashCode(); 681 } 682 683 /** 684 * If you want lexicographical ordering of range specifications, don't use this method, use the 685 * {@code min().toString()}. This works assuming the ranges being compared are disjoint. 686 */ 687 @Override 688 public String toString() { 689 // Consider caching if it turns out that we are serializing a lot of these. 690 StringBuilder s = new StringBuilder(); 691 for (int n = 0; n < bitmasks.length(); n++) { 692 appendMask(bitmasks.charAt(n), s); 693 } 694 return s.toString(); 695 } 696 697 /** Returns the string representation of a single bit-mask. */ 698 public static String toString(int bitMask) { 699 checkArgument(bitMask > 0 && bitMask < (1 << 10), "bad mask value: %s", bitMask); 700 return appendMask(bitMask, new StringBuilder()).toString(); 701 } 702 703 static StringBuilder appendMask(int mask, StringBuilder out) { 704 if (mask == ALL_DIGITS_MASK) { 705 out.append('x'); 706 } else if (hasOneBit(mask)) { 707 out.append(asChar(numberOfTrailingZeros(mask))); 708 } else { 709 out.append('['); 710 for (int loBit = numberOfTrailingZeros(mask); 711 loBit != 32; 712 loBit = numberOfTrailingZeros(mask)) { 713 // Always append the loBit digit into the range. 714 out.append(asChar(loBit)); 715 int hiBit = nextUnsetBit(mask, loBit); 716 int numBits = hiBit - loBit; 717 if (numBits > 1) { 718 // Stylistically prefer "[34]" to "[3-4]" for compactness. 719 if (numBits > 2) { 720 out.append('-'); 721 } 722 out.append(asChar(hiBit - 1)); 723 } 724 // Clear the bits we've just processed before going back round the loop. 725 mask &= ~((1 << hiBit) - 1); 726 } 727 out.append(']'); 728 } 729 return out; 730 } 731 732 // Turns a value in the range [0-9] into the corresponding ASCII character. 733 private static char asChar(int digit) { 734 return (char) ('0' + digit); 735 } 736 737 // Determines if the given bit-mask has only one bit set. 738 private static boolean hasOneBit(int mask) { 739 return (mask & (mask - 1)) == 0; 740 } 741 742 private static int nextUnsetBit(int mask, int bit) { 743 // Example mask transform for [013-589] if bit=3: 744 // v-- bit=3 745 // 01100111011 746 // 00000000111 (1 << 3) - 1 747 // 01100111111 OR with mask 748 // 10011000000 Bitwise NOT 749 // ^-- return=6 750 return numberOfTrailingZeros(~(mask | ((1 << bit) - 1))); 751 } 752 } 753