1 #![allow(non_camel_case_types)] 2 3 use std::fmt; 4 use std::ptr; 5 use std::str; 6 7 use libc::{c_int, c_void, size_t}; 8 9 pub struct Regex { 10 code: *mut code, 11 match_data: *mut match_data, 12 ovector: *mut size_t, 13 } 14 15 unsafe impl Send for Regex {} 16 17 impl Drop for Regex { drop(&mut self)18 fn drop(&mut self) { 19 unsafe { 20 pcre2_match_data_free_8(self.match_data); 21 pcre2_code_free_8(self.code); 22 } 23 } 24 } 25 26 pub struct Error { 27 code: c_int, 28 offset: size_t, 29 } 30 31 impl Regex { new(pattern: &str) -> Result<Regex, Error>32 pub fn new(pattern: &str) -> Result<Regex, Error> { 33 let mut error_code: c_int = 0; 34 let mut error_offset: size_t = 0; 35 let code = unsafe { 36 pcre2_compile_8( 37 pattern.as_ptr(), 38 pattern.len(), 39 // PCRE2 can get significantly faster in some cases depending 40 // on the permutation of these options (in particular, dropping 41 // UCP). We should endeavor to have a separate "ASCII compatible" 42 // benchmark. 43 PCRE2_UCP | PCRE2_UTF, 44 &mut error_code, 45 &mut error_offset, 46 ptr::null_mut(), 47 ) 48 }; 49 if code.is_null() { 50 return Err(Error { code: error_code, offset: error_offset }); 51 } 52 let err = unsafe { pcre2_jit_compile_8(code, PCRE2_JIT_COMPLETE) }; 53 if err < 0 { 54 panic!("pcre2_jit_compile_8 failed with error: {:?}", err); 55 } 56 let match_data = unsafe { 57 pcre2_match_data_create_from_pattern_8(code, ptr::null_mut()) 58 }; 59 if match_data.is_null() { 60 panic!("could not allocate match_data"); 61 } 62 let ovector = unsafe { pcre2_get_ovector_pointer_8(match_data) }; 63 if ovector.is_null() { 64 panic!("could not get ovector"); 65 } 66 Ok(Regex { code: code, match_data: match_data, ovector: ovector }) 67 } 68 is_match(&self, text: &str) -> bool69 pub fn is_match(&self, text: &str) -> bool { 70 self.find_at(text, 0).is_some() 71 } 72 find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't>73 pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { 74 FindMatches { re: self, text: text, last_match_end: 0 } 75 } 76 find_at(&self, text: &str, start: usize) -> Option<(usize, usize)>77 fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> { 78 // The man pages for PCRE2 say that pcre2_jit_match is the fastest 79 // way to execute a JIT match because it skips sanity checks. We also 80 // explicitly disable the UTF-8 validity check, but it's probably not 81 // necessary. 82 let err = unsafe { 83 pcre2_jit_match_8( 84 self.code, 85 text.as_ptr(), 86 text.len(), 87 start, 88 PCRE2_NO_UTF_CHECK, 89 self.match_data, 90 ptr::null_mut(), 91 ) 92 }; 93 if err == PCRE2_ERROR_NOMATCH { 94 None 95 } else if err < 0 { 96 panic!("unknown error code: {:?}", err) 97 } else { 98 Some(unsafe { (*self.ovector, *self.ovector.offset(1)) }) 99 } 100 } 101 } 102 103 pub struct FindMatches<'r, 't> { 104 re: &'r Regex, 105 text: &'t str, 106 last_match_end: usize, 107 } 108 109 impl<'r, 't> Iterator for FindMatches<'r, 't> { 110 type Item = (usize, usize); 111 next(&mut self) -> Option<(usize, usize)>112 fn next(&mut self) -> Option<(usize, usize)> { 113 match self.re.find_at(self.text, self.last_match_end) { 114 None => None, 115 Some((s, e)) => { 116 self.last_match_end = e; 117 Some((s, e)) 118 } 119 } 120 } 121 } 122 123 impl fmt::Debug for Error { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result124 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 125 const BUF_LEN: size_t = 256; 126 let mut buf = [0; BUF_LEN]; 127 let len = unsafe { 128 pcre2_get_error_message_8(self.code, buf.as_mut_ptr(), BUF_LEN) 129 }; 130 if len < 0 { 131 write!( 132 f, 133 "Unknown PCRE error. (code: {:?}, offset: {:?})", 134 self.code, self.offset 135 ) 136 } else { 137 let msg = str::from_utf8(&buf[..len as usize]).unwrap(); 138 write!(f, "error at {:?}: {}", self.offset, msg) 139 } 140 } 141 } 142 143 // PCRE2 FFI. We only wrap the bits we need. 144 145 const PCRE2_UCP: u32 = 0x00020000; 146 const PCRE2_UTF: u32 = 0x00080000; 147 const PCRE2_NO_UTF_CHECK: u32 = 0x40000000; 148 const PCRE2_JIT_COMPLETE: u32 = 0x00000001; 149 const PCRE2_ERROR_NOMATCH: c_int = -1; 150 151 type code = c_void; 152 153 type match_data = c_void; 154 155 type compile_context = c_void; // unused 156 157 type general_context = c_void; // unused 158 159 type match_context = c_void; // unused 160 161 extern "C" { pcre2_compile_8( pattern: *const u8, len: size_t, options: u32, error_code: *mut c_int, error_offset: *mut size_t, context: *mut compile_context, ) -> *mut code162 fn pcre2_compile_8( 163 pattern: *const u8, 164 len: size_t, 165 options: u32, 166 error_code: *mut c_int, 167 error_offset: *mut size_t, 168 context: *mut compile_context, 169 ) -> *mut code; 170 pcre2_code_free_8(code: *mut code)171 fn pcre2_code_free_8(code: *mut code); 172 pcre2_match_data_create_from_pattern_8( code: *const code, context: *mut general_context, ) -> *mut match_data173 fn pcre2_match_data_create_from_pattern_8( 174 code: *const code, 175 context: *mut general_context, 176 ) -> *mut match_data; 177 pcre2_match_data_free_8(match_data: *mut match_data)178 fn pcre2_match_data_free_8(match_data: *mut match_data); 179 pcre2_get_ovector_pointer_8(match_data: *mut match_data) -> *mut size_t180 fn pcre2_get_ovector_pointer_8(match_data: *mut match_data) 181 -> *mut size_t; 182 pcre2_jit_compile_8(code: *const code, options: u32) -> c_int183 fn pcre2_jit_compile_8(code: *const code, options: u32) -> c_int; 184 pcre2_jit_match_8( code: *const code, subject: *const u8, length: size_t, startoffset: size_t, options: u32, match_data: *mut match_data, match_context: *mut match_context, ) -> c_int185 fn pcre2_jit_match_8( 186 code: *const code, 187 subject: *const u8, 188 length: size_t, 189 startoffset: size_t, 190 options: u32, 191 match_data: *mut match_data, 192 match_context: *mut match_context, 193 ) -> c_int; 194 pcre2_get_error_message_8( error_code: c_int, buf: *mut u8, buflen: size_t, ) -> c_int195 fn pcre2_get_error_message_8( 196 error_code: c_int, 197 buf: *mut u8, 198 buflen: size_t, 199 ) -> c_int; 200 } 201