1 use std::char; 2 use std::cmp; 3 use std::fmt::Debug; 4 use std::slice; 5 use std::u8; 6 7 use unicode; 8 9 // This module contains an *internal* implementation of interval sets. 10 // 11 // The primary invariant that interval sets guards is canonical ordering. That 12 // is, every interval set contains an ordered sequence of intervals where 13 // no two intervals are overlapping or adjacent. While this invariant is 14 // occasionally broken within the implementation, it should be impossible for 15 // callers to observe it. 16 // 17 // Since case folding (as implemented below) breaks that invariant, we roll 18 // that into this API even though it is a little out of place in an otherwise 19 // generic interval set. (Hence the reason why the `unicode` module is imported 20 // here.) 21 // 22 // Some of the implementation complexity here is a result of me wanting to 23 // preserve the sequential representation without using additional memory. 24 // In many cases, we do use linear extra memory, but it is at most 2x and it 25 // is amortized. If we relaxed the memory requirements, this implementation 26 // could become much simpler. The extra memory is honestly probably OK, but 27 // character classes (especially of the Unicode variety) can become quite 28 // large, and it would be nice to keep regex compilation snappy even in debug 29 // builds. (In the past, I have been careless with this area of code and it has 30 // caused slow regex compilations in debug mode, so this isn't entirely 31 // unwarranted.) 32 // 33 // Tests on this are relegated to the public API of HIR in src/hir.rs. 34 35 #[derive(Clone, Debug, Eq, PartialEq)] 36 pub struct IntervalSet<I> { 37 ranges: Vec<I>, 38 } 39 40 impl<I: Interval> IntervalSet<I> { 41 /// Create a new set from a sequence of intervals. Each interval is 42 /// specified as a pair of bounds, where both bounds are inclusive. 43 /// 44 /// The given ranges do not need to be in any specific order, and ranges 45 /// may overlap. new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I>46 pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> { 47 let mut set = IntervalSet { ranges: intervals.into_iter().collect() }; 48 set.canonicalize(); 49 set 50 } 51 52 /// Add a new interval to this set. push(&mut self, interval: I)53 pub fn push(&mut self, interval: I) { 54 // TODO: This could be faster. e.g., Push the interval such that 55 // it preserves canonicalization. 56 self.ranges.push(interval); 57 self.canonicalize(); 58 } 59 60 /// Return an iterator over all intervals in this set. 61 /// 62 /// The iterator yields intervals in ascending order. iter(&self) -> IntervalSetIter<I>63 pub fn iter(&self) -> IntervalSetIter<I> { 64 IntervalSetIter(self.ranges.iter()) 65 } 66 67 /// Return an immutable slice of intervals in this set. 68 /// 69 /// The sequence returned is in canonical ordering. intervals(&self) -> &[I]70 pub fn intervals(&self) -> &[I] { 71 &self.ranges 72 } 73 74 /// Expand this interval set such that it contains all case folded 75 /// characters. For example, if this class consists of the range `a-z`, 76 /// then applying case folding will result in the class containing both the 77 /// ranges `a-z` and `A-Z`. 78 /// 79 /// This returns an error if the necessary case mapping data is not 80 /// available. case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError>81 pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { 82 let len = self.ranges.len(); 83 for i in 0..len { 84 let range = self.ranges[i]; 85 if let Err(err) = range.case_fold_simple(&mut self.ranges) { 86 self.canonicalize(); 87 return Err(err); 88 } 89 } 90 self.canonicalize(); 91 Ok(()) 92 } 93 94 /// Union this set with the given set, in place. union(&mut self, other: &IntervalSet<I>)95 pub fn union(&mut self, other: &IntervalSet<I>) { 96 // This could almost certainly be done more efficiently. 97 self.ranges.extend(&other.ranges); 98 self.canonicalize(); 99 } 100 101 /// Intersect this set with the given set, in place. intersect(&mut self, other: &IntervalSet<I>)102 pub fn intersect(&mut self, other: &IntervalSet<I>) { 103 if self.ranges.is_empty() { 104 return; 105 } 106 if other.ranges.is_empty() { 107 self.ranges.clear(); 108 return; 109 } 110 111 // There should be a way to do this in-place with constant memory, 112 // but I couldn't figure out a simple way to do it. So just append 113 // the intersection to the end of this range, and then drain it before 114 // we're done. 115 let drain_end = self.ranges.len(); 116 117 let mut ita = (0..drain_end).into_iter(); 118 let mut itb = (0..other.ranges.len()).into_iter(); 119 let mut a = ita.next().unwrap(); 120 let mut b = itb.next().unwrap(); 121 loop { 122 if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) { 123 self.ranges.push(ab); 124 } 125 let (it, aorb) = 126 if self.ranges[a].upper() < other.ranges[b].upper() { 127 (&mut ita, &mut a) 128 } else { 129 (&mut itb, &mut b) 130 }; 131 match it.next() { 132 Some(v) => *aorb = v, 133 None => break, 134 } 135 } 136 self.ranges.drain(..drain_end); 137 } 138 139 /// Subtract the given set from this set, in place. difference(&mut self, other: &IntervalSet<I>)140 pub fn difference(&mut self, other: &IntervalSet<I>) { 141 if self.ranges.is_empty() || other.ranges.is_empty() { 142 return; 143 } 144 145 // This algorithm is (to me) surprisingly complex. A search of the 146 // interwebs indicate that this is a potentially interesting problem. 147 // Folks seem to suggest interval or segment trees, but I'd like to 148 // avoid the overhead (both runtime and conceptual) of that. 149 // 150 // The following is basically my Shitty First Draft. Therefore, in 151 // order to grok it, you probably need to read each line carefully. 152 // Simplifications are most welcome! 153 // 154 // Remember, we can assume the canonical format invariant here, which 155 // says that all ranges are sorted, not overlapping and not adjacent in 156 // each class. 157 let drain_end = self.ranges.len(); 158 let (mut a, mut b) = (0, 0); 159 'LOOP: while a < drain_end && b < other.ranges.len() { 160 // Basically, the easy cases are when neither range overlaps with 161 // each other. If the `b` range is less than our current `a` 162 // range, then we can skip it and move on. 163 if other.ranges[b].upper() < self.ranges[a].lower() { 164 b += 1; 165 continue; 166 } 167 // ... similarly for the `a` range. If it's less than the smallest 168 // `b` range, then we can add it as-is. 169 if self.ranges[a].upper() < other.ranges[b].lower() { 170 let range = self.ranges[a]; 171 self.ranges.push(range); 172 a += 1; 173 continue; 174 } 175 // Otherwise, we have overlapping ranges. 176 assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); 177 178 // This part is tricky and was non-obvious to me without looking 179 // at explicit examples (see the tests). The trickiness stems from 180 // two things: 1) subtracting a range from another range could 181 // yield two ranges and 2) after subtracting a range, it's possible 182 // that future ranges can have an impact. The loop below advances 183 // the `b` ranges until they can't possible impact the current 184 // range. 185 // 186 // For example, if our `a` range is `a-t` and our next three `b` 187 // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply 188 // subtraction three times before moving on to the next `a` range. 189 let mut range = self.ranges[a]; 190 while b < other.ranges.len() 191 && !range.is_intersection_empty(&other.ranges[b]) 192 { 193 let old_range = range; 194 range = match range.difference(&other.ranges[b]) { 195 (None, None) => { 196 // We lost the entire range, so move on to the next 197 // without adding this one. 198 a += 1; 199 continue 'LOOP; 200 } 201 (Some(range1), None) | (None, Some(range1)) => range1, 202 (Some(range1), Some(range2)) => { 203 self.ranges.push(range1); 204 range2 205 } 206 }; 207 // It's possible that the `b` range has more to contribute 208 // here. In particular, if it is greater than the original 209 // range, then it might impact the next `a` range *and* it 210 // has impacted the current `a` range as much as possible, 211 // so we can quit. We don't bump `b` so that the next `a` 212 // range can apply it. 213 if other.ranges[b].upper() > old_range.upper() { 214 break; 215 } 216 // Otherwise, the next `b` range might apply to the current 217 // `a` range. 218 b += 1; 219 } 220 self.ranges.push(range); 221 a += 1; 222 } 223 while a < drain_end { 224 let range = self.ranges[a]; 225 self.ranges.push(range); 226 a += 1; 227 } 228 self.ranges.drain(..drain_end); 229 } 230 231 /// Compute the symmetric difference of the two sets, in place. 232 /// 233 /// This computes the symmetric difference of two interval sets. This 234 /// removes all elements in this set that are also in the given set, 235 /// but also adds all elements from the given set that aren't in this 236 /// set. That is, the set will contain all elements in either set, 237 /// but will not contain any elements that are in both sets. symmetric_difference(&mut self, other: &IntervalSet<I>)238 pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) { 239 // TODO(burntsushi): Fix this so that it amortizes allocation. 240 let mut intersection = self.clone(); 241 intersection.intersect(other); 242 self.union(other); 243 self.difference(&intersection); 244 } 245 246 /// Negate this interval set. 247 /// 248 /// For all `x` where `x` is any element, if `x` was in this set, then it 249 /// will not be in this set after negation. negate(&mut self)250 pub fn negate(&mut self) { 251 if self.ranges.is_empty() { 252 let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); 253 self.ranges.push(I::create(min, max)); 254 return; 255 } 256 257 // There should be a way to do this in-place with constant memory, 258 // but I couldn't figure out a simple way to do it. So just append 259 // the negation to the end of this range, and then drain it before 260 // we're done. 261 let drain_end = self.ranges.len(); 262 263 // We do checked arithmetic below because of the canonical ordering 264 // invariant. 265 if self.ranges[0].lower() > I::Bound::min_value() { 266 let upper = self.ranges[0].lower().decrement(); 267 self.ranges.push(I::create(I::Bound::min_value(), upper)); 268 } 269 for i in 1..drain_end { 270 let lower = self.ranges[i - 1].upper().increment(); 271 let upper = self.ranges[i].lower().decrement(); 272 self.ranges.push(I::create(lower, upper)); 273 } 274 if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { 275 let lower = self.ranges[drain_end - 1].upper().increment(); 276 self.ranges.push(I::create(lower, I::Bound::max_value())); 277 } 278 self.ranges.drain(..drain_end); 279 } 280 281 /// Converts this set into a canonical ordering. canonicalize(&mut self)282 fn canonicalize(&mut self) { 283 if self.is_canonical() { 284 return; 285 } 286 self.ranges.sort(); 287 assert!(!self.ranges.is_empty()); 288 289 // Is there a way to do this in-place with constant memory? I couldn't 290 // figure out a way to do it. So just append the canonicalization to 291 // the end of this range, and then drain it before we're done. 292 let drain_end = self.ranges.len(); 293 for oldi in 0..drain_end { 294 // If we've added at least one new range, then check if we can 295 // merge this range in the previously added range. 296 if self.ranges.len() > drain_end { 297 let (last, rest) = self.ranges.split_last_mut().unwrap(); 298 if let Some(union) = last.union(&rest[oldi]) { 299 *last = union; 300 continue; 301 } 302 } 303 let range = self.ranges[oldi]; 304 self.ranges.push(range); 305 } 306 self.ranges.drain(..drain_end); 307 } 308 309 /// Returns true if and only if this class is in a canonical ordering. is_canonical(&self) -> bool310 fn is_canonical(&self) -> bool { 311 for pair in self.ranges.windows(2) { 312 if pair[0] >= pair[1] { 313 return false; 314 } 315 if pair[0].is_contiguous(&pair[1]) { 316 return false; 317 } 318 } 319 true 320 } 321 } 322 323 /// An iterator over intervals. 324 #[derive(Debug)] 325 pub struct IntervalSetIter<'a, I: 'a>(slice::Iter<'a, I>); 326 327 impl<'a, I> Iterator for IntervalSetIter<'a, I> { 328 type Item = &'a I; 329 next(&mut self) -> Option<&'a I>330 fn next(&mut self) -> Option<&'a I> { 331 self.0.next() 332 } 333 } 334 335 pub trait Interval: 336 Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord 337 { 338 type Bound: Bound; 339 lower(&self) -> Self::Bound340 fn lower(&self) -> Self::Bound; upper(&self) -> Self::Bound341 fn upper(&self) -> Self::Bound; set_lower(&mut self, bound: Self::Bound)342 fn set_lower(&mut self, bound: Self::Bound); set_upper(&mut self, bound: Self::Bound)343 fn set_upper(&mut self, bound: Self::Bound); case_fold_simple( &self, intervals: &mut Vec<Self>, ) -> Result<(), unicode::CaseFoldError>344 fn case_fold_simple( 345 &self, 346 intervals: &mut Vec<Self>, 347 ) -> Result<(), unicode::CaseFoldError>; 348 349 /// Create a new interval. create(lower: Self::Bound, upper: Self::Bound) -> Self350 fn create(lower: Self::Bound, upper: Self::Bound) -> Self { 351 let mut int = Self::default(); 352 if lower <= upper { 353 int.set_lower(lower); 354 int.set_upper(upper); 355 } else { 356 int.set_lower(upper); 357 int.set_upper(lower); 358 } 359 int 360 } 361 362 /// Union the given overlapping range into this range. 363 /// 364 /// If the two ranges aren't contiguous, then this returns `None`. union(&self, other: &Self) -> Option<Self>365 fn union(&self, other: &Self) -> Option<Self> { 366 if !self.is_contiguous(other) { 367 return None; 368 } 369 let lower = cmp::min(self.lower(), other.lower()); 370 let upper = cmp::max(self.upper(), other.upper()); 371 Some(Self::create(lower, upper)) 372 } 373 374 /// Intersect this range with the given range and return the result. 375 /// 376 /// If the intersection is empty, then this returns `None`. intersect(&self, other: &Self) -> Option<Self>377 fn intersect(&self, other: &Self) -> Option<Self> { 378 let lower = cmp::max(self.lower(), other.lower()); 379 let upper = cmp::min(self.upper(), other.upper()); 380 if lower <= upper { 381 Some(Self::create(lower, upper)) 382 } else { 383 None 384 } 385 } 386 387 /// Subtract the given range from this range and return the resulting 388 /// ranges. 389 /// 390 /// If subtraction would result in an empty range, then no ranges are 391 /// returned. difference(&self, other: &Self) -> (Option<Self>, Option<Self>)392 fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) { 393 if self.is_subset(other) { 394 return (None, None); 395 } 396 if self.is_intersection_empty(other) { 397 return (Some(self.clone()), None); 398 } 399 let add_lower = other.lower() > self.lower(); 400 let add_upper = other.upper() < self.upper(); 401 // We know this because !self.is_subset(other) and the ranges have 402 // a non-empty intersection. 403 assert!(add_lower || add_upper); 404 let mut ret = (None, None); 405 if add_lower { 406 let upper = other.lower().decrement(); 407 ret.0 = Some(Self::create(self.lower(), upper)); 408 } 409 if add_upper { 410 let lower = other.upper().increment(); 411 let range = Self::create(lower, self.upper()); 412 if ret.0.is_none() { 413 ret.0 = Some(range); 414 } else { 415 ret.1 = Some(range); 416 } 417 } 418 ret 419 } 420 421 /// Compute the symmetric difference the given range from this range. This 422 /// returns the union of the two ranges minus its intersection. symmetric_difference( &self, other: &Self, ) -> (Option<Self>, Option<Self>)423 fn symmetric_difference( 424 &self, 425 other: &Self, 426 ) -> (Option<Self>, Option<Self>) { 427 let union = match self.union(other) { 428 None => return (Some(self.clone()), Some(other.clone())), 429 Some(union) => union, 430 }; 431 let intersection = match self.intersect(other) { 432 None => return (Some(self.clone()), Some(other.clone())), 433 Some(intersection) => intersection, 434 }; 435 union.difference(&intersection) 436 } 437 438 /// Returns true if and only if the two ranges are contiguous. Two ranges 439 /// are contiguous if and only if the ranges are either overlapping or 440 /// adjacent. is_contiguous(&self, other: &Self) -> bool441 fn is_contiguous(&self, other: &Self) -> bool { 442 let lower1 = self.lower().as_u32(); 443 let upper1 = self.upper().as_u32(); 444 let lower2 = other.lower().as_u32(); 445 let upper2 = other.upper().as_u32(); 446 cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1) 447 } 448 449 /// Returns true if and only if the intersection of this range and the 450 /// other range is empty. is_intersection_empty(&self, other: &Self) -> bool451 fn is_intersection_empty(&self, other: &Self) -> bool { 452 let (lower1, upper1) = (self.lower(), self.upper()); 453 let (lower2, upper2) = (other.lower(), other.upper()); 454 cmp::max(lower1, lower2) > cmp::min(upper1, upper2) 455 } 456 457 /// Returns true if and only if this range is a subset of the other range. is_subset(&self, other: &Self) -> bool458 fn is_subset(&self, other: &Self) -> bool { 459 let (lower1, upper1) = (self.lower(), self.upper()); 460 let (lower2, upper2) = (other.lower(), other.upper()); 461 (lower2 <= lower1 && lower1 <= upper2) 462 && (lower2 <= upper1 && upper1 <= upper2) 463 } 464 } 465 466 pub trait Bound: 467 Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord 468 { min_value() -> Self469 fn min_value() -> Self; max_value() -> Self470 fn max_value() -> Self; as_u32(self) -> u32471 fn as_u32(self) -> u32; increment(self) -> Self472 fn increment(self) -> Self; decrement(self) -> Self473 fn decrement(self) -> Self; 474 } 475 476 impl Bound for u8 { min_value() -> Self477 fn min_value() -> Self { 478 u8::MIN 479 } max_value() -> Self480 fn max_value() -> Self { 481 u8::MAX 482 } as_u32(self) -> u32483 fn as_u32(self) -> u32 { 484 self as u32 485 } increment(self) -> Self486 fn increment(self) -> Self { 487 self.checked_add(1).unwrap() 488 } decrement(self) -> Self489 fn decrement(self) -> Self { 490 self.checked_sub(1).unwrap() 491 } 492 } 493 494 impl Bound for char { min_value() -> Self495 fn min_value() -> Self { 496 '\x00' 497 } max_value() -> Self498 fn max_value() -> Self { 499 '\u{10FFFF}' 500 } as_u32(self) -> u32501 fn as_u32(self) -> u32 { 502 self as u32 503 } 504 increment(self) -> Self505 fn increment(self) -> Self { 506 match self { 507 '\u{D7FF}' => '\u{E000}', 508 c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(), 509 } 510 } 511 decrement(self) -> Self512 fn decrement(self) -> Self { 513 match self { 514 '\u{E000}' => '\u{D7FF}', 515 c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(), 516 } 517 } 518 } 519 520 // Tests for interval sets are written in src/hir.rs against the public API. 521