1 #![allow(non_camel_case_types)] 2 3 use libc::{c_int, c_uchar, c_void}; 4 5 /// Regex wraps an RE2 regular expression. 6 /// 7 /// It cannot be used safely from multiple threads simultaneously. 8 pub struct Regex { 9 re: *mut re2_regexp, 10 } 11 12 unsafe impl Send for Regex {} 13 14 impl Drop for Regex { drop(&mut self)15 fn drop(&mut self) { 16 unsafe { 17 re2_regexp_free(self.re); 18 } 19 } 20 } 21 22 #[derive(Debug)] 23 pub struct Error(()); 24 25 impl Regex { new(pattern: &str) -> Result<Regex, Error>26 pub fn new(pattern: &str) -> Result<Regex, Error> { 27 unsafe { Ok(Regex { re: re2_regexp_new(pattern.into()) }) } 28 } 29 is_match(&self, text: &str) -> bool30 pub fn is_match(&self, text: &str) -> bool { 31 unsafe { 32 re2_regexp_match(self.re, text.into(), 0, text.len() as c_int) 33 } 34 } 35 find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't>36 pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { 37 FindMatches { re: self, text: text, last_end: 0, last_match: None } 38 } 39 find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>40 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { 41 let (mut s, mut e): (c_int, c_int) = (0, 0); 42 let matched = unsafe { 43 re2_regexp_find( 44 self.re, 45 text.into(), 46 start as c_int, 47 text.len() as c_int, 48 &mut s, 49 &mut e, 50 ) 51 }; 52 if matched { 53 Some((s as usize, e as usize)) 54 } else { 55 None 56 } 57 } 58 } 59 60 pub struct FindMatches<'r, 't> { 61 re: &'r Regex, 62 text: &'t str, 63 last_end: usize, 64 last_match: Option<usize>, 65 } 66 67 // This implementation is identical to the one Rust uses, since both Rust's 68 // regex engine and RE2 handle empty matches in the same way. 69 impl<'r, 't> Iterator for FindMatches<'r, 't> { 70 type Item = (usize, usize); 71 next(&mut self) -> Option<(usize, usize)>72 fn next(&mut self) -> Option<(usize, usize)> { 73 fn next_after_empty(text: &str, i: usize) -> usize { 74 let b = match text.as_bytes().get(i) { 75 None => return text.len() + 1, 76 Some(&b) => b, 77 }; 78 let inc = if b <= 0x7F { 79 1 80 } else if b <= 0b110_11111 { 81 2 82 } else if b <= 0b1110_1111 { 83 3 84 } else { 85 4 86 }; 87 i + inc 88 } 89 90 if self.last_end > self.text.len() { 91 return None; 92 } 93 let (s, e) = match self.re.find_at(self.text, self.last_end) { 94 None => return None, 95 Some((s, e)) => (s, e), 96 }; 97 assert!(s >= self.last_end); 98 if s == e { 99 // This is an empty match. To ensure we make progress, start 100 // the next search at the smallest possible starting position 101 // of the next match following this one. 102 self.last_end = next_after_empty(&self.text, e); 103 // Don't accept empty matches immediately following a match. 104 // Just move on to the next match. 105 if Some(e) == self.last_match { 106 return self.next(); 107 } 108 } else { 109 self.last_end = e; 110 } 111 self.last_match = Some(self.last_end); 112 Some((s, e)) 113 } 114 } 115 116 // RE2 FFI is below. Note that this uses a hand-rolled C API that is defined 117 // in re2.cpp. 118 119 type re2_regexp = c_void; 120 121 #[repr(C)] 122 struct re2_string { 123 text: *const c_uchar, 124 len: c_int, 125 } 126 127 impl<'a> From<&'a str> for re2_string { from(s: &'a str) -> re2_string128 fn from(s: &'a str) -> re2_string { 129 re2_string { text: s.as_ptr(), len: s.len() as c_int } 130 } 131 } 132 133 extern "C" { re2_regexp_new(pat: re2_string) -> *mut re2_regexp134 fn re2_regexp_new(pat: re2_string) -> *mut re2_regexp; re2_regexp_free(re: *mut re2_regexp)135 fn re2_regexp_free(re: *mut re2_regexp); re2_regexp_match( re: *mut re2_regexp, text: re2_string, startpos: c_int, endpos: c_int, ) -> bool136 fn re2_regexp_match( 137 re: *mut re2_regexp, 138 text: re2_string, 139 startpos: c_int, 140 endpos: c_int, 141 ) -> bool; re2_regexp_find( re: *mut re2_regexp, text: re2_string, startpos: c_int, endpos: c_int, match_start: *mut c_int, match_end: *mut c_int, ) -> bool142 fn re2_regexp_find( 143 re: *mut re2_regexp, 144 text: re2_string, 145 startpos: c_int, 146 endpos: c_int, 147 match_start: *mut c_int, 148 match_end: *mut c_int, 149 ) -> bool; 150 } 151