1 use std::char; 2 use std::cmp::Ordering; 3 use std::fmt; 4 use std::ops; 5 use std::u32; 6 7 use crate::literal::LiteralSearcher; 8 use crate::prog::InstEmptyLook; 9 use crate::utf8::{decode_last_utf8, decode_utf8}; 10 11 /// Represents a location in the input. 12 #[derive(Clone, Copy, Debug)] 13 pub struct InputAt { 14 pos: usize, 15 c: Char, 16 byte: Option<u8>, 17 len: usize, 18 } 19 20 impl InputAt { 21 /// Returns true iff this position is at the beginning of the input. is_start(&self) -> bool22 pub fn is_start(&self) -> bool { 23 self.pos == 0 24 } 25 26 /// Returns true iff this position is past the end of the input. is_end(&self) -> bool27 pub fn is_end(&self) -> bool { 28 self.c.is_none() && self.byte.is_none() 29 } 30 31 /// Returns the character at this position. 32 /// 33 /// If this position is just before or after the input, then an absent 34 /// character is returned. char(&self) -> Char35 pub fn char(&self) -> Char { 36 self.c 37 } 38 39 /// Returns the byte at this position. byte(&self) -> Option<u8>40 pub fn byte(&self) -> Option<u8> { 41 self.byte 42 } 43 44 /// Returns the UTF-8 width of the character at this position. len(&self) -> usize45 pub fn len(&self) -> usize { 46 self.len 47 } 48 49 /// Returns whether the UTF-8 width of the character at this position 50 /// is zero. is_empty(&self) -> bool51 pub fn is_empty(&self) -> bool { 52 self.len == 0 53 } 54 55 /// Returns the byte offset of this position. pos(&self) -> usize56 pub fn pos(&self) -> usize { 57 self.pos 58 } 59 60 /// Returns the byte offset of the next position in the input. next_pos(&self) -> usize61 pub fn next_pos(&self) -> usize { 62 self.pos + self.len 63 } 64 } 65 66 /// An abstraction over input used in the matching engines. 67 pub trait Input: fmt::Debug { 68 /// Return an encoding of the position at byte offset `i`. at(&self, i: usize) -> InputAt69 fn at(&self, i: usize) -> InputAt; 70 71 /// Return the Unicode character occurring next to `at`. 72 /// 73 /// If no such character could be decoded, then `Char` is absent. next_char(&self, at: InputAt) -> Char74 fn next_char(&self, at: InputAt) -> Char; 75 76 /// Return the Unicode character occurring previous to `at`. 77 /// 78 /// If no such character could be decoded, then `Char` is absent. previous_char(&self, at: InputAt) -> Char79 fn previous_char(&self, at: InputAt) -> Char; 80 81 /// Return true if the given empty width instruction matches at the 82 /// input position given. is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool83 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; 84 85 /// Scan the input for a matching prefix. prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>86 fn prefix_at( 87 &self, 88 prefixes: &LiteralSearcher, 89 at: InputAt, 90 ) -> Option<InputAt>; 91 92 /// The number of bytes in the input. len(&self) -> usize93 fn len(&self) -> usize; 94 95 /// Whether the input is empty. is_empty(&self) -> bool96 fn is_empty(&self) -> bool { 97 self.len() == 0 98 } 99 100 /// Return the given input as a sequence of bytes. as_bytes(&self) -> &[u8]101 fn as_bytes(&self) -> &[u8]; 102 } 103 104 impl<'a, T: Input> Input for &'a T { at(&self, i: usize) -> InputAt105 fn at(&self, i: usize) -> InputAt { 106 (**self).at(i) 107 } 108 next_char(&self, at: InputAt) -> Char109 fn next_char(&self, at: InputAt) -> Char { 110 (**self).next_char(at) 111 } 112 previous_char(&self, at: InputAt) -> Char113 fn previous_char(&self, at: InputAt) -> Char { 114 (**self).previous_char(at) 115 } 116 is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool117 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { 118 (**self).is_empty_match(at, empty) 119 } 120 prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>121 fn prefix_at( 122 &self, 123 prefixes: &LiteralSearcher, 124 at: InputAt, 125 ) -> Option<InputAt> { 126 (**self).prefix_at(prefixes, at) 127 } 128 len(&self) -> usize129 fn len(&self) -> usize { 130 (**self).len() 131 } 132 as_bytes(&self) -> &[u8]133 fn as_bytes(&self) -> &[u8] { 134 (**self).as_bytes() 135 } 136 } 137 138 /// An input reader over characters. 139 #[derive(Clone, Copy, Debug)] 140 pub struct CharInput<'t>(&'t [u8]); 141 142 impl<'t> CharInput<'t> { 143 /// Return a new character input reader for the given string. new(s: &'t [u8]) -> CharInput<'t>144 pub fn new(s: &'t [u8]) -> CharInput<'t> { 145 CharInput(s) 146 } 147 } 148 149 impl<'t> ops::Deref for CharInput<'t> { 150 type Target = [u8]; 151 deref(&self) -> &[u8]152 fn deref(&self) -> &[u8] { 153 self.0 154 } 155 } 156 157 impl<'t> Input for CharInput<'t> { at(&self, i: usize) -> InputAt158 fn at(&self, i: usize) -> InputAt { 159 if i >= self.len() { 160 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } 161 } else { 162 let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); 163 InputAt { pos: i, c, byte: None, len: c.len_utf8() } 164 } 165 } 166 next_char(&self, at: InputAt) -> Char167 fn next_char(&self, at: InputAt) -> Char { 168 at.char() 169 } 170 previous_char(&self, at: InputAt) -> Char171 fn previous_char(&self, at: InputAt) -> Char { 172 decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() 173 } 174 is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool175 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { 176 use crate::prog::EmptyLook::*; 177 match empty.look { 178 StartLine => { 179 let c = self.previous_char(at); 180 at.pos() == 0 || c == '\n' 181 } 182 EndLine => { 183 let c = self.next_char(at); 184 at.pos() == self.len() || c == '\n' 185 } 186 StartText => at.pos() == 0, 187 EndText => at.pos() == self.len(), 188 WordBoundary => { 189 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 190 c1.is_word_char() != c2.is_word_char() 191 } 192 NotWordBoundary => { 193 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 194 c1.is_word_char() == c2.is_word_char() 195 } 196 WordBoundaryAscii => { 197 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 198 c1.is_word_byte() != c2.is_word_byte() 199 } 200 NotWordBoundaryAscii => { 201 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 202 c1.is_word_byte() == c2.is_word_byte() 203 } 204 } 205 } 206 prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>207 fn prefix_at( 208 &self, 209 prefixes: &LiteralSearcher, 210 at: InputAt, 211 ) -> Option<InputAt> { 212 prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) 213 } 214 len(&self) -> usize215 fn len(&self) -> usize { 216 self.0.len() 217 } 218 as_bytes(&self) -> &[u8]219 fn as_bytes(&self) -> &[u8] { 220 self.0 221 } 222 } 223 224 /// An input reader over bytes. 225 #[derive(Clone, Copy, Debug)] 226 pub struct ByteInput<'t> { 227 text: &'t [u8], 228 only_utf8: bool, 229 } 230 231 impl<'t> ByteInput<'t> { 232 /// Return a new byte-based input reader for the given string. new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t>233 pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { 234 ByteInput { text, only_utf8 } 235 } 236 } 237 238 impl<'t> ops::Deref for ByteInput<'t> { 239 type Target = [u8]; 240 deref(&self) -> &[u8]241 fn deref(&self) -> &[u8] { 242 self.text 243 } 244 } 245 246 impl<'t> Input for ByteInput<'t> { at(&self, i: usize) -> InputAt247 fn at(&self, i: usize) -> InputAt { 248 if i >= self.len() { 249 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } 250 } else { 251 InputAt { 252 pos: i, 253 c: None.into(), 254 byte: self.get(i).cloned(), 255 len: 1, 256 } 257 } 258 } 259 next_char(&self, at: InputAt) -> Char260 fn next_char(&self, at: InputAt) -> Char { 261 decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() 262 } 263 previous_char(&self, at: InputAt) -> Char264 fn previous_char(&self, at: InputAt) -> Char { 265 decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() 266 } 267 is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool268 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { 269 use crate::prog::EmptyLook::*; 270 match empty.look { 271 StartLine => { 272 let c = self.previous_char(at); 273 at.pos() == 0 || c == '\n' 274 } 275 EndLine => { 276 let c = self.next_char(at); 277 at.pos() == self.len() || c == '\n' 278 } 279 StartText => at.pos() == 0, 280 EndText => at.pos() == self.len(), 281 WordBoundary => { 282 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 283 c1.is_word_char() != c2.is_word_char() 284 } 285 NotWordBoundary => { 286 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 287 c1.is_word_char() == c2.is_word_char() 288 } 289 WordBoundaryAscii => { 290 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 291 if self.only_utf8 { 292 // If we must match UTF-8, then we can't match word 293 // boundaries at invalid UTF-8. 294 if c1.is_none() && !at.is_start() { 295 return false; 296 } 297 if c2.is_none() && !at.is_end() { 298 return false; 299 } 300 } 301 c1.is_word_byte() != c2.is_word_byte() 302 } 303 NotWordBoundaryAscii => { 304 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 305 if self.only_utf8 { 306 // If we must match UTF-8, then we can't match word 307 // boundaries at invalid UTF-8. 308 if c1.is_none() && !at.is_start() { 309 return false; 310 } 311 if c2.is_none() && !at.is_end() { 312 return false; 313 } 314 } 315 c1.is_word_byte() == c2.is_word_byte() 316 } 317 } 318 } 319 prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>320 fn prefix_at( 321 &self, 322 prefixes: &LiteralSearcher, 323 at: InputAt, 324 ) -> Option<InputAt> { 325 prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) 326 } 327 len(&self) -> usize328 fn len(&self) -> usize { 329 self.text.len() 330 } 331 as_bytes(&self) -> &[u8]332 fn as_bytes(&self) -> &[u8] { 333 self.text 334 } 335 } 336 337 /// An inline representation of `Option<char>`. 338 /// 339 /// This eliminates the need to do case analysis on `Option<char>` to determine 340 /// ordinality with other characters. 341 /// 342 /// (The `Option<char>` is not related to encoding. Instead, it is used in the 343 /// matching engines to represent the beginning and ending boundaries of the 344 /// search text.) 345 #[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] 346 pub struct Char(u32); 347 348 impl fmt::Debug for Char { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result349 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 350 match char::from_u32(self.0) { 351 None => write!(f, "Empty"), 352 Some(c) => write!(f, "{:?}", c), 353 } 354 } 355 } 356 357 impl Char { 358 /// Returns true iff the character is absent. 359 #[inline] is_none(self) -> bool360 pub fn is_none(self) -> bool { 361 self.0 == u32::MAX 362 } 363 364 /// Returns the length of the character's UTF-8 encoding. 365 /// 366 /// If the character is absent, then `1` is returned. 367 #[inline] len_utf8(self) -> usize368 pub fn len_utf8(self) -> usize { 369 char::from_u32(self.0).map_or(1, |c| c.len_utf8()) 370 } 371 372 /// Returns true iff the character is a word character. 373 /// 374 /// If the character is absent, then false is returned. is_word_char(self) -> bool375 pub fn is_word_char(self) -> bool { 376 // is_word_character can panic if the Unicode data for \w isn't 377 // available. However, our compiler ensures that if a Unicode word 378 // boundary is used, then the data must also be available. If it isn't, 379 // then the compiler returns an error. 380 char::from_u32(self.0).map_or(false, regex_syntax::is_word_character) 381 } 382 383 /// Returns true iff the byte is a word byte. 384 /// 385 /// If the byte is absent, then false is returned. is_word_byte(self) -> bool386 pub fn is_word_byte(self) -> bool { 387 match char::from_u32(self.0) { 388 Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8), 389 None | Some(_) => false, 390 } 391 } 392 } 393 394 impl From<char> for Char { from(c: char) -> Char395 fn from(c: char) -> Char { 396 Char(c as u32) 397 } 398 } 399 400 impl From<Option<char>> for Char { from(c: Option<char>) -> Char401 fn from(c: Option<char>) -> Char { 402 c.map_or(Char(u32::MAX), |c| c.into()) 403 } 404 } 405 406 impl PartialEq<char> for Char { 407 #[inline] eq(&self, other: &char) -> bool408 fn eq(&self, other: &char) -> bool { 409 self.0 == *other as u32 410 } 411 } 412 413 impl PartialEq<Char> for char { 414 #[inline] eq(&self, other: &Char) -> bool415 fn eq(&self, other: &Char) -> bool { 416 *self as u32 == other.0 417 } 418 } 419 420 impl PartialOrd<char> for Char { 421 #[inline] partial_cmp(&self, other: &char) -> Option<Ordering>422 fn partial_cmp(&self, other: &char) -> Option<Ordering> { 423 self.0.partial_cmp(&(*other as u32)) 424 } 425 } 426 427 impl PartialOrd<Char> for char { 428 #[inline] partial_cmp(&self, other: &Char) -> Option<Ordering>429 fn partial_cmp(&self, other: &Char) -> Option<Ordering> { 430 (*self as u32).partial_cmp(&other.0) 431 } 432 } 433